Overview of This Report

This document shows my R codes that I used to clean and analyze Hooli’s employee survey data. The document is structured as follows:

1. Data preparation & processing for analysis

1.a. Loading Data

library(readxl)
df_original <- read_xlsx("data/sr._people_science_analyst_assignment_dataset_2024.xlsx")
## Warning: Expecting numeric in Q1601 / R1601C17: got 'N/A'
## Warning: Expecting numeric in N2047 / R2047C14: got 'N/A'
## Warning: Expecting numeric in I2380 / R2380C9: got 'N/A'
## Warning: Expecting numeric in E2515 / R2515C5: got 'N/A'

1.b. Data exploration & preparation

  • ‘lea_3’ should be a numeric variable. After checking the data, employee ID M01562’s response is only missing for’lea_3’. Therefore, this ‘N/A’ should be recoded as -1.

  • ‘hiredate’ should be a date variable. It is also important to note that the current format is written differently based on the country that an employee is from (it’s always important to check the entire dataset!!). Australia: Day/Month/Year Denmark: Day/Month/Year France: Day/Month/Year Germany: Day/Month/Year India: Day/Month/Year UK: Day/Month/Year United Kingdom: Day/Month/Year United States: Month/Day/Year USA: Month/Day/Year Canada: Month/Day/Year China: Year/Month/Day So in converting this variable to date-type, I need to specify this format in my code.

#checking the macro-level trends of the dataset 

head(df_original, 5)
## # A tibble: 5 × 27
##   eeid   ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01434     5     5     4     5     3     1     5     5     5     5     5     5
## 2 M00631     4     4     4     4     4     4     4     4     4     3     3     4
## 3 M00325     5     5     4     5     4     5     5     5     5     4     3     5
## 4 M00805     5     5     5     5     5     5     5     5     5     5     5     5
## 5 M00157     5     5     4     5     5     5     5     5     5     4     3     5
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <chr>, lea_4 <dbl>, age <chr>,
## #   hiredate <chr>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>
tail(df_original, 5)
## # A tibble: 5 × 27
##   eeid   ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M02004     4     4     5     4     3     4     5     4     4     4     5     4
## 2 M02642     4     4     4     4     4     4     4     4     4     4     3     4
## 3 M01255     3     3     1     3     4     3     4     2     3     2     2     5
## 4 M01626     5     5     5     4     4     3     5     5     5     5     5     5
## 5 M02432     4     2     1     3     3     5     5     5     4     4     5     5
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <chr>, lea_4 <dbl>, age <chr>,
## #   hiredate <chr>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>
str(df_original)
## tibble [2,651 × 27] (S3: tbl_df/tbl/data.frame)
##  $ eeid          : chr [1:2651] "M01434" "M00631" "M00325" "M00805" ...
##  $ ali_1         : num [1:2651] 5 4 5 5 5 4 4 5 5 5 ...
##  $ ali_2         : num [1:2651] 5 4 5 5 5 4 3 5 5 5 ...
##  $ ali_3         : num [1:2651] 4 4 4 5 4 3 4 4 4 5 ...
##  $ col_1         : num [1:2651] 5 4 5 5 5 4 3 4 5 5 ...
##  $ col_2         : num [1:2651] 3 4 4 5 5 4 3 5 4 4 ...
##  $ col_3         : num [1:2651] 1 4 5 5 5 4 3 5 4 5 ...
##  $ eng_1         : num [1:2651] 5 4 5 5 5 4 4 5 5 5 ...
##  $ eng_2         : num [1:2651] 5 4 5 5 5 4 3 4 5 5 ...
##  $ eng_3         : num [1:2651] 5 4 5 5 5 3 4 5 5 4 ...
##  $ eng_4         : num [1:2651] 5 3 4 5 4 4 3 4 5 5 ...
##  $ eng_5         : num [1:2651] 5 3 3 5 3 4 3 5 5 3 ...
##  $ inc_1         : num [1:2651] 5 4 5 5 5 4 4 5 5 5 ...
##  $ inc_2         : num [1:2651] 5 4 5 5 5 3 4 4 4 4 ...
##  $ inc_3         : num [1:2651] 5 4 5 5 5 2 4 4 5 5 ...
##  $ inc_4         : num [1:2651] 5 4 5 4 4 4 4 5 5 4 ...
##  $ inc_5         : num [1:2651] 3 3 5 3 5 4 4 5 5 5 ...
##  $ lea_1         : num [1:2651] 4 4 5 5 5 2 5 5 5 3 ...
##  $ lea_2         : num [1:2651] 5 5 5 5 5 4 5 5 5 5 ...
##  $ lea_3         : chr [1:2651] "5" "5" "5" "5" ...
##  $ lea_4         : num [1:2651] 5 5 5 5 5 4 5 5 5 5 ...
##  $ age           : chr [1:2651] "35-44" "45-54" "18-24" "45-54" ...
##  $ hiredate      : chr [1:2651] "25/02/2024" "02/10/2022" "20/10/2023" "08/12/2018" ...
##  $ race          : chr [1:2651] NA NA NA "White" ...
##  $ gender        : chr [1:2651] "Male" "Male" "Male" "Male" ...
##  $ manager_status: chr [1:2651] "Non-Manager" "Non-Manager" "Non-Manager" "Manager" ...
##  $ country       : chr [1:2651] "France" "France" "United Kingdom" "United States" ...
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(df_original, main = 'missing data map for the Hooli Survey Data', col = c('purple', 'black'), legend = TRUE)
## Warning: Unknown or uninitialised column: `arguments`.
## Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `imputations`.

#except for race, most values are not missing!

#Creating a new dataset that I will prepare and process for analysis

df <- df_original 

#Fixing errors in 'lea_3'

table(df_original$lea_3)
## 
##   -1    1    2    3    4    5   99  N/A 
##   12   22   58  197 1147 1213    1    1
df[df$lea_3 == "N/A", ]
## # A tibble: 1 × 27
##   eeid   ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01562     4     3     4     4     4     5     4     5     4     4     3     5
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <chr>, lea_4 <dbl>, age <chr>,
## #   hiredate <chr>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>
df$lea_3[df$lea_3 == "N/A"] <- -1
table(df$lea_3)
## 
##   -1    1    2    3    4    5   99 
##   13   22   58  197 1147 1213    1
#transforming 'lea_3' to a numeric variable
df$lea_3 <- as.numeric(df$lea_3)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
df <- df %>%
  mutate(hiredate = case_when(
    country %in% c("Australia", "Denmark", "France", "Germany", "India", "UK", "United Kingdom") ~ as.Date(hiredate, format = "%d/%m/%Y"),
    country %in% c("United States", "USA", "Canada") ~ as.Date(hiredate, format = "%m/%d/%Y"),
    country == "China" ~ as.Date(hiredate, format = "%Y/%m/%d"),
    TRUE ~ as.Date(NA)  
  ))

library(summarytools)
dfSummary(df$hiredate)
## df$hiredate was converted to a data frame
## Data Frame Summary  
## df  
## Dimensions: 2651 x 1  
## Duplicates: 1215  
## 
## -----------------------------------------------------------------------------------------------------
## No   Variable   Stats / Values      Freqs (% of Valid)     Graph                 Valid      Missing  
## ---- ---------- ------------------- ---------------------- --------------------- ---------- ---------
## 1    hiredate   min : 2004-12-24    1436 distinct values                     :   2651       0        
##      [Date]     med : 2022-06-30                                             :   (100.0%)   (0.0%)   
##                 max : 2024-02-29                                             :                       
##                 range : 19y 2m 5d                                          : :                       
##                                                                      . . : : :                       
## -----------------------------------------------------------------------------------------------------
  • Handling outliers, ‘99’, in various variables

I identified several outliers, value 99, from these variables: ali_1, ali_2, ali_3, col_1, col_2, col_3, and lea_4. The value 99 is a value that is out of the scale of -1 to 5. I discovered that all these responses come from Denmark. Thus, it was probably a systematic coding error for these responses from Denmark. Given that these are psychological measures, it doesn’t make sense code 99 might denote an exceptional response out of -1 to 5 scale. Thus, I deem 99 as a coding error. Additionally, these employees are less than 1% of the participants in the data and that they completed all the other questions in the survey. Therefore, I recode 99 to -1, the code for missing value.

library(summarytools)
dfSummary(df)
## Data Frame Summary  
## df  
## Dimensions: 2651 x 27  
## Duplicates: 0  
## 
## ----------------------------------------------------------------------------------------------------------------------
## No   Variable         Stats / Values                 Freqs (% of Valid)     Graph                 Valid      Missing  
## ---- ---------------- ------------------------------ ---------------------- --------------------- ---------- ---------
## 1    eeid             1. M00001                         1 ( 0.0%)                                 2651       0        
##      [character]      2. M00002                         1 ( 0.0%)                                 (100.0%)   (0.0%)   
##                       3. M00003                         1 ( 0.0%)                                                     
##                       4. M00004                         1 ( 0.0%)                                                     
##                       5. M00005                         1 ( 0.0%)                                                     
##                       6. M00006                         1 ( 0.0%)                                                     
##                       7. M00007                         1 ( 0.0%)                                                     
##                       8. M00008                         1 ( 0.0%)                                                     
##                       9. M00009                         1 ( 0.0%)                                                     
##                       10. M00010                        1 ( 0.0%)                                                     
##                       [ 2641 others ]                2641 (99.6%)           IIIIIIIIIIIIIIIIIII                       
## 
## 2    ali_1            Mean (sd) : 4.3 (2.8)          -1 :   16 ( 0.6%)                            2651       0        
##      [numeric]        min < med < max:               1 :   27 ( 1.0%)                             (100.0%)   (0.0%)   
##                       -1 < 4 < 99                    2 :   64 ( 2.4%)                                                 
##                       IQR (CV) : 1 (0.6)             3 :  208 ( 7.8%)       I                                         
##                                                      4 : 1124 (42.4%)       IIIIIIII                                  
##                                                      5 : 1210 (45.6%)       IIIIIIIII                                 
##                                                      99 :    2 ( 0.1%)                                                
## 
## 3    ali_2            Mean (sd) : 4.2 (2.1)          -1 :   23 ( 0.9%)                            2651       0        
##      [numeric]        min < med < max:               1 :   44 ( 1.7%)                             (100.0%)   (0.0%)   
##                       -1 < 4 < 99                    2 :   86 ( 3.2%)                                                 
##                       IQR (CV) : 1 (0.5)             3 :  310 (11.7%)       II                                        
##                                                      4 : 1088 (41.0%)       IIIIIIII                                  
##                                                      5 : 1099 (41.5%)       IIIIIIII                                  
##                                                      99 :    1 ( 0.0%)                                                
## 
## 4    ali_3            Mean (sd) : 3.8 (2.2)          -1 :   23 ( 0.9%)                            2651       0        
##      [numeric]        min < med < max:               1 :  101 ( 3.8%)                             (100.0%)   (0.0%)   
##                       -1 < 4 < 99                    2 :  232 ( 8.8%)       I                                         
##                       IQR (CV) : 2 (0.6)             3 :  496 (18.7%)       III                                       
##                                                      4 : 1086 (41.0%)       IIIIIIII                                  
##                                                      5 :  712 (26.9%)       IIIII                                     
##                                                      99 :    1 ( 0.0%)                                                
## 
## 5    col_1            Mean (sd) : 4.2 (2.8)          -1 :   10 ( 0.4%)                            2650       1        
##      [numeric]        min < med < max:               1 :   33 ( 1.2%)                             (100.0%)   (0.0%)   
##                       -1 < 4 < 99                    2 :  114 ( 4.3%)                                                 
##                       IQR (CV) : 1 (0.7)             3 :  318 (12.0%)       II                                        
##                                                      4 : 1135 (42.8%)       IIIIIIII                                  
##                                                      5 : 1038 (39.2%)       IIIIIII                                   
##                                                      99 :    2 ( 0.1%)                                                
## 
## 6    col_2            Mean (sd) : 3.7 (2.2)          -1 :  76 ( 2.9%)                             2651       0        
##      [numeric]        min < med < max:               1 :  43 ( 1.6%)                              (100.0%)   (0.0%)   
##                       -1 < 4 < 99                    2 : 130 ( 4.9%)                                                  
##                       IQR (CV) : 1 (0.6)             3 : 853 (32.2%)        IIIIII                                    
##                                                      4 : 900 (33.9%)        IIIIII                                    
##                                                      5 : 648 (24.4%)        IIII                                      
##                                                      99 :   1 ( 0.0%)                                                 
## 
## 7    col_3            Mean (sd) : 4.1 (2.1)          -1 :   24 ( 0.9%)                            2651       0        
##      [numeric]        min < med < max:               1 :   55 ( 2.1%)                             (100.0%)   (0.0%)   
##                       -1 < 4 < 99                    2 :  128 ( 4.8%)                                                 
##                       IQR (CV) : 1 (0.5)             3 :  327 (12.3%)       II                                        
##                                                      4 : 1037 (39.1%)       IIIIIII                                   
##                                                      5 : 1079 (40.7%)       IIIIIIII                                  
##                                                      99 :    1 ( 0.0%)                                                
## 
## 8    eng_1            Mean (sd) : 4.5 (0.7)          -1 :    6 ( 0.2%)                            2651       0        
##      [numeric]        min < med < max:               1 :    5 ( 0.2%)                             (100.0%)   (0.0%)   
##                       -1 < 5 < 5                     2 :   23 ( 0.9%)                                                 
##                       IQR (CV) : 1 (0.2)             3 :  181 ( 6.8%)       I                                         
##                                                      4 :  869 (32.8%)       IIIIII                                    
##                                                      5 : 1567 (59.1%)       IIIIIIIIIII                               
## 
## 9    eng_2            Mean (sd) : 4.3 (0.9)          -1 :    4 ( 0.2%)                            2650       1        
##      [numeric]        min < med < max:               1 :   48 ( 1.8%)                             (100.0%)   (0.0%)   
##                       -1 < 5 < 5                     2 :  104 ( 3.9%)                                                 
##                       IQR (CV) : 1 (0.2)             3 :  243 ( 9.2%)       I                                         
##                                                      4 :  827 (31.2%)       IIIIII                                    
##                                                      5 : 1424 (53.7%)       IIIIIIIIII                                
## 
## 10   eng_3            Mean (sd) : 4.1 (1)            -1 :   12 ( 0.5%)                            2651       0        
##      [numeric]        min < med < max:               1 :   51 ( 1.9%)                             (100.0%)   (0.0%)   
##                       -1 < 4 < 5                     2 :   95 ( 3.6%)                                                 
##                       IQR (CV) : 1 (0.2)             3 :  379 (14.3%)       II                                        
##                                                      4 :  995 (37.5%)       IIIIIII                                   
##                                                      5 : 1119 (42.2%)       IIIIIIII                                  
## 
## 11   eng_4            Mean (sd) : 3.9 (1.1)          -1 :   8 ( 0.3%)                             2651       0        
##      [numeric]        min < med < max:               1 :  60 ( 2.3%)                              (100.0%)   (0.0%)   
##                       -1 < 4 < 5                     2 : 249 ( 9.4%)        I                                         
##                       IQR (CV) : 2 (0.3)             3 : 523 (19.7%)        III                                       
##                                                      4 : 950 (35.8%)        IIIIIII                                   
##                                                      5 : 861 (32.5%)        IIIIII                                    
## 
## 12   eng_5            Mean (sd) : 4 (1)              -1 :    8 ( 0.3%)                            2651       0        
##      [numeric]        min < med < max:               1 :   55 ( 2.1%)                             (100.0%)   (0.0%)   
##                       -1 < 4 < 5                     2 :  163 ( 6.1%)       I                                         
##                       IQR (CV) : 2 (0.3)             3 :  516 (19.5%)       III                                       
##                                                      4 :  896 (33.8%)       IIIIII                                    
##                                                      5 : 1013 (38.2%)       IIIIIII                                   
## 
## 13   inc_1            Mean (sd) : 4.4 (0.9)          -1 :   19 ( 0.7%)                            2651       0        
##      [numeric]        min < med < max:               1 :   26 ( 1.0%)                             (100.0%)   (0.0%)   
##                       -1 < 5 < 5                     2 :   52 ( 2.0%)                                                 
##                       IQR (CV) : 1 (0.2)             3 :  196 ( 7.4%)       I                                         
##                                                      4 :  828 (31.2%)       IIIIII                                    
##                                                      5 : 1530 (57.7%)       IIIIIIIIIII                               
## 
## 14   inc_2            Mean (sd) : 3.9 (1.2)          -1 :  78 ( 2.9%)                             2650       1        
##      [numeric]        min < med < max:               1 :  30 ( 1.1%)                              (100.0%)   (0.0%)   
##                       -1 < 4 < 5                     2 : 103 ( 3.9%)                                                  
##                       IQR (CV) : 2 (0.3)             3 : 575 (21.7%)        IIII                                      
##                                                      4 : 908 (34.3%)        IIIIII                                    
##                                                      5 : 956 (36.1%)        IIIIIII                                   
## 
## 15   inc_3            Mean (sd) : 4.1 (1.1)          -1 :   50 ( 1.9%)                            2651       0        
##      [numeric]        min < med < max:               1 :   28 ( 1.1%)                             (100.0%)   (0.0%)   
##                       -1 < 4 < 5                     2 :   91 ( 3.4%)                                                 
##                       IQR (CV) : 1 (0.3)             3 :  370 (14.0%)       II                                        
##                                                      4 :  916 (34.6%)       IIIIII                                    
##                                                      5 : 1196 (45.1%)       IIIIIIIII                                 
## 
## 16   inc_4            Mean (sd) : 3.9 (1.2)          -1 :  64 ( 2.4%)                             2651       0        
##      [numeric]        min < med < max:               1 :  33 ( 1.2%)                              (100.0%)   (0.0%)   
##                       -1 < 4 < 5                     2 : 110 ( 4.1%)                                                  
##                       IQR (CV) : 2 (0.3)             3 : 634 (23.9%)        IIII                                      
##                                                      4 : 895 (33.8%)        IIIIII                                    
##                                                      5 : 915 (34.5%)        IIIIII                                    
## 
## 17   inc_5            Mean (sd) : 4 (1.2)            -1 :   46 ( 1.7%)                            2650       1        
##      [numeric]        min < med < max:               1 :   71 ( 2.7%)                             (100.0%)   (0.0%)   
##                       -1 < 4 < 5                     2 :  125 ( 4.7%)                                                 
##                       IQR (CV) : 1 (0.3)             3 :  390 (14.7%)       II                                        
##                                                      4 : 1018 (38.4%)       IIIIIII                                   
##                                                      5 : 1000 (37.7%)       IIIIIII                                   
## 
## 18   lea_1            Mean (sd) : 4.2 (1)            -1 :   20 ( 0.8%)                            2651       0        
##      [numeric]        min < med < max:               1 :   24 ( 0.9%)                             (100.0%)   (0.0%)   
##                       -1 < 4 < 5                     2 :   79 ( 3.0%)                                                 
##                       IQR (CV) : 1 (0.2)             3 :  335 (12.6%)       II                                        
##                                                      4 : 1074 (40.5%)       IIIIIIII                                  
##                                                      5 : 1119 (42.2%)       IIIIIIII                                  
## 
## 19   lea_2            Mean (sd) : 4.2 (1)            -1 :   19 ( 0.7%)                            2651       0        
##      [numeric]        min < med < max:               1 :   35 ( 1.3%)                             (100.0%)   (0.0%)   
##                       -1 < 4 < 5                     2 :   75 ( 2.8%)                                                 
##                       IQR (CV) : 1 (0.2)             3 :  287 (10.8%)       II                                        
##                                                      4 :  979 (36.9%)       IIIIIII                                   
##                                                      5 : 1256 (47.4%)       IIIIIIIII                                 
## 
## 20   lea_3            Mean (sd) : 4.3 (2)            -1 :   13 ( 0.5%)                            2651       0        
##      [numeric]        min < med < max:               1 :   22 ( 0.8%)                             (100.0%)   (0.0%)   
##                       -1 < 4 < 99                    2 :   58 ( 2.2%)                                                 
##                       IQR (CV) : 1 (0.5)             3 :  197 ( 7.4%)       I                                         
##                                                      4 : 1147 (43.3%)       IIIIIIII                                  
##                                                      5 : 1213 (45.8%)       IIIIIIIII                                 
##                                                      99 :    1 ( 0.0%)                                                
## 
## 21   lea_4            Mean (sd) : 4.4 (2)            -1 :   16 ( 0.6%)                            2651       0        
##      [numeric]        min < med < max:               1 :   15 ( 0.6%)                             (100.0%)   (0.0%)   
##                       -1 < 5 < 99                    2 :   41 ( 1.5%)                                                 
##                       IQR (CV) : 1 (0.5)             3 :  267 (10.1%)       II                                        
##                                                      4 :  930 (35.1%)       IIIIIII                                   
##                                                      5 : 1381 (52.1%)       IIIIIIIIII                                
##                                                      99 :    1 ( 0.0%)                                                
## 
## 22   age              1. 25-34                       1266 (47.8%)           IIIIIIIII             2651       0        
##      [character]      2. 35-44                        795 (30.0%)           IIIII                 (100.0%)   (0.0%)   
##                       3. 45-54                        301 (11.4%)           II                                        
##                       4. 18-24                        147 ( 5.5%)           I                                         
##                       5. 55-64                         78 ( 2.9%)                                                     
##                       6. 65+                           54 ( 2.0%)                                                     
##                       7. 26                             2 ( 0.1%)                                                     
##                       8. N/A                            2 ( 0.1%)                                                     
##                       9. 19                             1 ( 0.0%)                                                     
##                       10. 21                            1 ( 0.0%)                                                     
##                       [ 4 others ]                      4 ( 0.2%)                                                     
## 
## 23   hiredate         min : 2004-12-24               1436 distinct values                     :   2651       0        
##      [Date]           med : 2022-06-30                                                        :   (100.0%)   (0.0%)   
##                       max : 2024-02-29                                                        :                       
##                       range : 19y 2m 5d                                                     : :                       
##                                                                                       . . : : :                       
## 
## 24   race             1. American Indian/Alaskan N      2 ( 0.1%)                                 1584       1067     
##      [character]      2. Asian                        359 (22.7%)           IIII                  (59.8%)    (40.2%)  
##                       3. Black or African American     37 ( 2.3%)                                                     
##                       4. Black or African American     11 ( 0.7%)                                                     
##                       5. Hispanic or Latino            80 ( 5.1%)           I                                         
##                       6. Native Hawaiian or Other       2 ( 0.1%)                                                     
##                       7. Two or More Races             43 ( 2.7%)                                                     
##                       8. White                       1050 (66.3%)           IIIIIIIIIIIII                             
## 
## 25   gender           1. Female                       816 (30.8%)           IIIIII                2651       0        
##      [character]      2. Male                        1826 (68.9%)           IIIIIIIIIIIII         (100.0%)   (0.0%)   
##                       3. Man                            4 ( 0.2%)                                                     
##                       4. Unknown                        1 ( 0.0%)                                                     
##                       5. Woman                          4 ( 0.2%)                                                     
## 
## 26   manager_status   1. Manager                      654 (24.7%)           IIII                  2651       0        
##      [character]      2. Non-Manager                 1997 (75.3%)           IIIIIIIIIIIIIII       (100.0%)   (0.0%)   
## 
## 27   country          1. Australia                    101 ( 3.8%)                                 2651       0        
##      [character]      2. Canada                        84 ( 3.2%)                                 (100.0%)   (0.0%)   
##                       3. China                         60 ( 2.3%)                                                     
##                       4. Denmark                       24 ( 0.9%)                                                     
##                       5. France                       136 ( 5.1%)           I                                         
##                       6. Germany                       48 ( 1.8%)                                                     
##                       7. India                        288 (10.9%)           II                                        
##                       8. UK                            11 ( 0.4%)                                                     
##                       9. United Kingdom               348 (13.1%)           II                                        
##                       10. United States              1520 (57.3%)           IIIIIIIIIII                               
##                       11. USA                          31 ( 1.2%)                                                     
## ----------------------------------------------------------------------------------------------------------------------
#examine the responses of the employees who responded '99' to one of these questions.
df[df$ali_1 == 99, ] # other responses are in the normal scale range & 2 participants = M00238 & M00568
## # A tibble: 2 × 27
##   eeid   ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M00238    99     4     4     4     4     4     5     3     4     3     3     5
## 2 M00568    99     5     5     5     3     5     5     5     5     5     5     5
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>
df[df$ali_2 == 99, ] # except for ali_2 & lea_4, all the other responses are in the normal scale range & 1 participant = M01455
## # A tibble: 1 × 27
##   eeid   ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01455     5    99     3     5     4     2     5     5     5     4     4     5
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>
df[df$ali_3 == 99, ] # other responses are in the normal scale range & 1 participant = M02319
## # A tibble: 1 × 27
##   eeid   ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M02319     4     4    99     4     3     4     4     5     4     4     4     5
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>
df[df$col_1 == 99, ] # other responses are in the normal scale range & 2 participants = M01339, M01843
## # A tibble: 3 × 27
##   eeid   ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01339     5     4     3    99     2     3     5     4     5     4     4     5
## 2 M01843     4     3     4    99     4     2     4     3     3     5     4     4
## 3 <NA>      NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>
df[df$col_2 == 99, ] # other responses are in the normal scale range & 1 participant = M00168
## # A tibble: 1 × 27
##   eeid   ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M00168     4     4     4     2    99     3     5     4     5     4     4     4
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>
df[df$col_3 == 99, ] # other responses are in the normal scale range & 1 participant = M01393
## # A tibble: 1 × 27
##   eeid   ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01393     3     3     3     4     3    99     5     5     5     5     5     3
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>
df[df$lea_3 == 99, ] # except fo lea_3, all the other responses are in the normal scale range & 1 participant = M01484
## # A tibble: 1 × 27
##   eeid   ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01484     5     5     4     5     5     5     5     5     5     5     5     5
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>
df[df$lea_4 == 99, ] # except for ali_2 & lea_4, all the other responses are in the normal scale range & 1 participant = M01455
## # A tibble: 1 × 27
##   eeid   ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01455     5    99     3     5     4     2     5     5     5     4     4     5
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>
# recode values with 99 to -1.

df$ali_1[df$ali_1 == 99] <- -1
df$ali_2[df$ali_2 == 99] <- -1
df$ali_3[df$ali_3 == 99] <- -1
df$col_1[df$col_1 == 99] <- -1
df$col_2[df$col_2 == 99] <- -1
df$col_3[df$col_3 == 99] <- -1
df$lea_3[df$lea_3 == 99] <- -1
df$lea_4[df$lea_4 == 99] <- -1
  • col_1, eng_2, inc_2, & inc_5 have 1 missing value each. It should be correctly coded as -1, the correct code for missing values.
df[is.na(df$col_1), ] #for employee_ID M02429, only col_1 is missing. It should be coded as -1. 
## # A tibble: 1 × 27
##   eeid   ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M02429     5     5     4    NA     5     5     4     3     2     4     4     5
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>
df[is.na(df$eng_2), ] #for employee_ID M01597, only eng_2 is missing. It should be coded as -1.
## # A tibble: 1 × 27
##   eeid   ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01597     4     3     3     4     4     4     3    NA     3     3     3     4
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>
df[is.na(df$inc_2), ] #for employee_ID M01481, only inc_2 is missing. It should be coded as -1.
## # A tibble: 1 × 27
##   eeid   ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01481     3     4     4     3     4     4     4     3     3     4     4     4
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>
df[is.na(df$inc_5), ] #for employee_ID M01722, only inc_5 is missing. It should be coded as -1. 
## # A tibble: 1 × 27
##   eeid   ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01722     4     4     4     4     4     4     4     3     4     4     4     3
## # ℹ 14 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>
df$col_1[is.na(df$col_1)] <- -1
df$eng_2[is.na(df$eng_2)] <- -1
df$inc_2[is.na(df$inc_2)] <- -1
df$inc_5[is.na(df$inc_5)] <- -1
  • Checking for duplicates: there are no duplicates based on employee ID. each response comes from a unique employee
sum(duplicated(df$eeid))
## [1] 0
  • For the ‘country’ variable, ‘UK’ and ‘United Kingdom’ should be merged into one category. Similarly, ‘USA’ and ‘United States’ should be merged into one category. Let’s combine these categories for each country.
table(df$country)
## 
##      Australia         Canada          China        Denmark         France 
##            101             84             60             24            136 
##        Germany          India             UK United Kingdom  United States 
##             48            288             11            348           1520 
##            USA 
##             31
df$country[df$country == "UK"] <- "United Kingdom"
df$country[df$country == "USA"] <- "United States"
table(df$country)
## 
##      Australia         Canada          China        Denmark         France 
##            101             84             60             24            136 
##        Germany          India United Kingdom  United States 
##             48            288            359           1551
  • For ‘age’ variable, multiple categories need to be consolidated:
    • the group ‘19’, ‘21’ should be consolidated into ‘18-24’.
    • the group ‘26’, ‘28’ should be consolidated into ‘25-34’.
    • the group ‘39’, ‘42’ should be consolidated into ‘35-44’.
    • the group ‘48’ should be consolidated into ‘45-54’.
table(df$age, useNA = "always")
## 
## 18-24    19    21 25-34    26    28 35-44    39    42 45-54    48 55-64   65+ 
##   147     1     1  1266     2     1   795     1     1   301     1    78    54 
##   N/A  <NA> 
##     2     0
df$age[df$age == "19"] <- "18-24"
df$age[df$age == "21"] <- "18-24"
df$age[df$age == "26"] <- "25-34"
df$age[df$age == "28"] <- "25-34"
df$age[df$age == "39"] <- "35-44"
df$age[df$age == "42"] <- "35-44"
df$age[df$age == "48"] <- "45-54"
  • For ‘race’ variable, “Black or African American” & “Black or African Americans” should be mergedinto one group.
table(df$race, useNA = "always")
## 
##            American Indian/Alaskan Native 
##                                         2 
##                                     Asian 
##                                       359 
##                 Black or African American 
##                                        37 
##                Black or African Americans 
##                                        11 
##                        Hispanic or Latino 
##                                        80 
## Native Hawaiian or Other Pacific Islander 
##                                         2 
##                         Two or More Races 
##                                        43 
##                                     White 
##                                      1050 
##                                      <NA> 
##                                      1067
df$race[df$race == "Black or African Americans"] <- "Black or African American"
  • For ‘gender’ variable, these groups below need to be consolidated: -Consolidate “Male” & “Man” into one group -Consolidate “Female” & “Woman” into one group
table(df$gender, useNA = "always")
## 
##  Female    Male     Man Unknown   Woman    <NA> 
##     816    1826       4       1       4       0
df$gender[df$gender == "Man"] <- "Male"
df$gender[df$gender == "Woman"] <- "Female"
  • 40.2% of race variable’s observations (1067) are missing. Let’s see if the values in this variable has patterns.

    Based on the contingency table below,

    • 101/101 employees in Australia have no information about their race.
    • 60/84 employees in Canada have no information about their race except 24 who are white.
    • 55/60 employees in China have no information about their race except 5 who are white.
    • 24/24 employees in Denmark have no information about their race.
    • 136/136 employees in France have no information about their race.
    • 48/48 employees in Germany have no information about their race.
    • 288/288 employees in India have no information about their race.
    • 355/359 employees in the United Kingdom have no information about their race.
    • 1017/1017 employees in the United States have demographic information about race.

Except for the United States, all the countries with missing information about race are largely ethnically homogeneous (e.g., Denmark). Therefore, the company might not collect information about employee’s race as the variance in race among employees might be too small to be a meaningful (For example, according to demographic research, about 90.2 percent of the population in Australia are white). Therefore, conducting an analysis about race for other countries except for the United States would not provide meaningful insights to Hooli.

c_t <- table(df$race, df$country, useNA = "always")
c_t
##                                            
##                                             Australia Canada China Denmark
##   American Indian/Alaskan Native                    0      0     0       0
##   Asian                                             0      0     0       0
##   Black or African American                         0      0     0       0
##   Hispanic or Latino                                0      0     0       0
##   Native Hawaiian or Other Pacific Islander         0      0     0       0
##   Two or More Races                                 0      0     0       0
##   White                                             0     24     5       0
##   <NA>                                            101     60    55      24
##                                            
##                                             France Germany India United Kingdom
##   American Indian/Alaskan Native                 0       0     0              0
##   Asian                                          0       0     0              0
##   Black or African American                      0       0     0              0
##   Hispanic or Latino                             0       0     0              0
##   Native Hawaiian or Other Pacific Islander      0       0     0              0
##   Two or More Races                              0       0     0              0
##   White                                          0       0     0              4
##   <NA>                                         136      48   288            355
##                                            
##                                             United States <NA>
##   American Indian/Alaskan Native                        2    0
##   Asian                                               359    0
##   Black or African American                            48    0
##   Hispanic or Latino                                   80    0
##   Native Hawaiian or Other Pacific Islander             2    0
##   Two or More Races                                    43    0
##   White                                              1017    0
##   <NA>                                                  0    0
* 0% of 'hiredate' variable's observations are missing. 
df_2 <- subset(df, select = c(hiredate))
library(Amelia)
missmap(df_2)
## Warning: Unknown or uninitialised column: `arguments`.
## Unknown or uninitialised column: `arguments`.
## Warning: Unknown or uninitialised column: `imputations`.
rm(df_2)

#the earliest & latest date for hiredate
min(df$hiredate, na.rm = TRUE) # 2014-12-24
## [1] "2004-12-24"
max(df$hiredate, na.rm = TRUE) # 2024-02-29
## [1] "2024-02-29"
#let's create a new variable named hireyear 
df$hireyear <- format(df$hiredate, "%Y")
table(df$hireyear, useNA = "always")
## 
## 2004 2005 2006 2010 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 <NA> 
##    1    1    1    1   41   40   31   42  125  129  307  308  632  954   38    0
library(ggplot2)

ggplot(df, aes(x=hireyear)) + geom_bar()

ggplot(df, aes(x=hiredate)) + geom_histogram() #this company hired employees a ton in 2022!!
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

* Create a new variable called ‘tenure’ by using hiredate.

  Substract 'hiredate' from March 1st 2024, the date instructed by the technical assessment rubric to create different tenure groups by using 'hiredate'. It's important to note that there are 11 employees who were hired on or after March 1st 2024. However, given that I am asked to analyze the data as of March 1st 2024, I omit these 11 observations to not introduce a bias in my data. For example, introducing this 11 observations into under 3 months might change the characteristics of those in that group as of March 1st 2024. 
  
  - Given that there are only 4 employees in '10+ years' groups, we do not report any findings about this group.
  
#let's create the cutpoint, which is 2024-03-01, to create tenure groups
cutpoint <- as.Date("2024-03-01")
#given that the average number of days in a month is 365/12 = 30.4166666667, 
tenure_days <- as.numeric(difftime(cutpoint, df$hiredate, units = "days"))
tenure_months <- as.numeric(difftime(cutpoint, df$hiredate, units = "days")/(365/12))

#create tenure_groups 
tenure_groups <- cut(tenure_months,
                      breaks = c(0, 3, 6, 12, 24, 48, 72, 120, Inf),  # In months
                      labels = c("Under 3 months", "3-6 months", "6-12 months", "1-2 years", "2-4 years", "4-6 years", "6-10 years", "10+ years"), right = FALSE) # right = FALSE to EXCLUDE the right side month in the cateogry

#the distribution of employees in each tenure group
table(tenure_groups, useNA = "always")
## tenure_groups
## Under 3 months     3-6 months    6-12 months      1-2 years      2-4 years 
##             64            325            507            670            644 
##      4-6 years     6-10 years      10+ years           <NA> 
##            274            163              4              0
# Add tenure_group variable to data frame
df$tenure_group <- as.character(tenure_groups)

# Print the data frame with the new tenure_group variable
table(df$tenure_group, useNA = "always")
## 
##      1-2 years      10+ years      2-4 years     3-6 months      4-6 years 
##            670              4            644            325            274 
##     6-10 years    6-12 months Under 3 months           <NA> 
##            163            507             64              0
  • Changing all the missing values to NA (I named this new dataset df_2) to make sure that ‘-1’ does not influence the results of our analysis.
df_2 <- df

df_2$ali_1[df_2$ali_1 == -1] <- NA 
df_2$ali_2[df_2$ali_2 == -1] <- NA
df_2$ali_3[df_2$ali_3 == -1] <- NA

df_2$col_1[df_2$col_1 == -1] <- NA 
df_2$col_2[df_2$col_2 == -1] <- NA 
df_2$col_3[df_2$col_3 == -1] <- NA 

df_2$eng_1[df_2$eng_1 == -1] <- NA 
df_2$eng_2[df_2$eng_2 == -1] <- NA
df_2$eng_3[df_2$eng_3 == -1] <- NA 
df_2$eng_4[df_2$eng_4 == -1] <- NA
df_2$eng_5[df_2$eng_5 == -1] <- NA

df_2$inc_1[df_2$inc_1 == -1] <- NA 
df_2$inc_2[df_2$inc_2 == -1] <- NA
df_2$inc_3[df_2$inc_3 == -1] <- NA
df_2$inc_4[df_2$inc_4 == -1] <- NA
df_2$inc_5[df_2$inc_5 == -1] <- NA 

df_2$lea_1[df_2$lea_1 == -1] <- NA
df_2$lea_2[df_2$lea_2 == -1] <- NA
df_2$lea_3[df_2$lea_3 == -1] <- NA
df_2$lea_4[df_2$lea_4 == -1] <- NA

df_2$age[df_2$age == "N/A"] <- NA

#we convert "Unknown" to NA as this category has less than 5 people and we do not report.
df_2$gender[df_2$gender == "Unknown"] <- NA

dfSummary(df_2)
## Data Frame Summary  
## df_2  
## Dimensions: 2651 x 29  
## Duplicates: 0  
## 
## ----------------------------------------------------------------------------------------------------------------------
## No   Variable         Stats / Values                 Freqs (% of Valid)     Graph                 Valid      Missing  
## ---- ---------------- ------------------------------ ---------------------- --------------------- ---------- ---------
## 1    eeid             1. M00001                         1 ( 0.0%)                                 2651       0        
##      [character]      2. M00002                         1 ( 0.0%)                                 (100.0%)   (0.0%)   
##                       3. M00003                         1 ( 0.0%)                                                     
##                       4. M00004                         1 ( 0.0%)                                                     
##                       5. M00005                         1 ( 0.0%)                                                     
##                       6. M00006                         1 ( 0.0%)                                                     
##                       7. M00007                         1 ( 0.0%)                                                     
##                       8. M00008                         1 ( 0.0%)                                                     
##                       9. M00009                         1 ( 0.0%)                                                     
##                       10. M00010                        1 ( 0.0%)                                                     
##                       [ 2641 others ]                2641 (99.6%)           IIIIIIIIIIIIIIIIIII                       
## 
## 2    ali_1            Mean (sd) : 4.3 (0.8)          1 :   27 ( 1.0%)                             2633       18       
##      [numeric]        min < med < max:               2 :   64 ( 2.4%)                             (99.3%)    (0.7%)   
##                       1 < 4 < 5                      3 :  208 ( 7.9%)       I                                         
##                       IQR (CV) : 1 (0.2)             4 : 1124 (42.7%)       IIIIIIII                                  
##                                                      5 : 1210 (46.0%)       IIIIIIIII                                 
## 
## 3    ali_2            Mean (sd) : 4.2 (0.9)          1 :   44 ( 1.7%)                             2627       24       
##      [numeric]        min < med < max:               2 :   86 ( 3.3%)                             (99.1%)    (0.9%)   
##                       1 < 4 < 5                      3 :  310 (11.8%)       II                                        
##                       IQR (CV) : 1 (0.2)             4 : 1088 (41.4%)       IIIIIIII                                  
##                                                      5 : 1099 (41.8%)       IIIIIIII                                  
## 
## 4    ali_3            Mean (sd) : 3.8 (1.1)          1 :  101 ( 3.8%)                             2627       24       
##      [numeric]        min < med < max:               2 :  232 ( 8.8%)       I                     (99.1%)    (0.9%)   
##                       1 < 4 < 5                      3 :  496 (18.9%)       III                                       
##                       IQR (CV) : 2 (0.3)             4 : 1086 (41.3%)       IIIIIIII                                  
##                                                      5 :  712 (27.1%)       IIIII                                     
## 
## 5    col_1            Mean (sd) : 4.1 (0.9)          1 :   33 ( 1.3%)                             2638       13       
##      [numeric]        min < med < max:               2 :  114 ( 4.3%)                             (99.5%)    (0.5%)   
##                       1 < 4 < 5                      3 :  318 (12.1%)       II                                        
##                       IQR (CV) : 1 (0.2)             4 : 1135 (43.0%)       IIIIIIII                                  
##                                                      5 : 1038 (39.3%)       IIIIIII                                   
## 
## 6    col_2            Mean (sd) : 3.8 (0.9)          1 :  43 ( 1.7%)                              2574       77       
##      [numeric]        min < med < max:               2 : 130 ( 5.1%)        I                     (97.1%)    (2.9%)   
##                       1 < 4 < 5                      3 : 853 (33.1%)        IIIIII                                    
##                       IQR (CV) : 2 (0.2)             4 : 900 (35.0%)        IIIIII                                    
##                                                      5 : 648 (25.2%)        IIIII                                     
## 
## 7    col_3            Mean (sd) : 4.1 (1)            1 :   55 ( 2.1%)                             2626       25       
##      [numeric]        min < med < max:               2 :  128 ( 4.9%)                             (99.1%)    (0.9%)   
##                       1 < 4 < 5                      3 :  327 (12.5%)       II                                        
##                       IQR (CV) : 1 (0.2)             4 : 1037 (39.5%)       IIIIIII                                   
##                                                      5 : 1079 (41.1%)       IIIIIIII                                  
## 
## 8    eng_1            Mean (sd) : 4.5 (0.7)          1 :    5 ( 0.2%)                             2645       6        
##      [numeric]        min < med < max:               2 :   23 ( 0.9%)                             (99.8%)    (0.2%)   
##                       1 < 5 < 5                      3 :  181 ( 6.8%)       I                                         
##                       IQR (CV) : 1 (0.2)             4 :  869 (32.9%)       IIIIII                                    
##                                                      5 : 1567 (59.2%)       IIIIIIIIIII                               
## 
## 9    eng_2            Mean (sd) : 4.3 (0.9)          1 :   48 ( 1.8%)                             2646       5        
##      [numeric]        min < med < max:               2 :  104 ( 3.9%)                             (99.8%)    (0.2%)   
##                       1 < 5 < 5                      3 :  243 ( 9.2%)       I                                         
##                       IQR (CV) : 1 (0.2)             4 :  827 (31.3%)       IIIIII                                    
##                                                      5 : 1424 (53.8%)       IIIIIIIIII                                
## 
## 10   eng_3            Mean (sd) : 4.2 (0.9)          1 :   51 ( 1.9%)                             2639       12       
##      [numeric]        min < med < max:               2 :   95 ( 3.6%)                             (99.5%)    (0.5%)   
##                       1 < 4 < 5                      3 :  379 (14.4%)       II                                        
##                       IQR (CV) : 1 (0.2)             4 :  995 (37.7%)       IIIIIII                                   
##                                                      5 : 1119 (42.4%)       IIIIIIII                                  
## 
## 11   eng_4            Mean (sd) : 3.9 (1)            1 :  60 ( 2.3%)                              2643       8        
##      [numeric]        min < med < max:               2 : 249 ( 9.4%)        I                     (99.7%)    (0.3%)   
##                       1 < 4 < 5                      3 : 523 (19.8%)        III                                       
##                       IQR (CV) : 2 (0.3)             4 : 950 (35.9%)        IIIIIII                                   
##                                                      5 : 861 (32.6%)        IIIIII                                    
## 
## 12   eng_5            Mean (sd) : 4 (1)              1 :   55 ( 2.1%)                             2643       8        
##      [numeric]        min < med < max:               2 :  163 ( 6.2%)       I                     (99.7%)    (0.3%)   
##                       1 < 4 < 5                      3 :  516 (19.5%)       III                                       
##                       IQR (CV) : 2 (0.3)             4 :  896 (33.9%)       IIIIII                                    
##                                                      5 : 1013 (38.3%)       IIIIIII                                   
## 
## 13   inc_1            Mean (sd) : 4.4 (0.8)          1 :   26 ( 1.0%)                             2632       19       
##      [numeric]        min < med < max:               2 :   52 ( 2.0%)                             (99.3%)    (0.7%)   
##                       1 < 5 < 5                      3 :  196 ( 7.4%)       I                                         
##                       IQR (CV) : 1 (0.2)             4 :  828 (31.5%)       IIIIII                                    
##                                                      5 : 1530 (58.1%)       IIIIIIIIIII                               
## 
## 14   inc_2            Mean (sd) : 4 (0.9)            1 :  30 ( 1.2%)                              2572       79       
##      [numeric]        min < med < max:               2 : 103 ( 4.0%)                              (97.0%)    (3.0%)   
##                       1 < 4 < 5                      3 : 575 (22.4%)        IIII                                      
##                       IQR (CV) : 2 (0.2)             4 : 908 (35.3%)        IIIIIII                                   
##                                                      5 : 956 (37.2%)        IIIIIII                                   
## 
## 15   inc_3            Mean (sd) : 4.2 (0.9)          1 :   28 ( 1.1%)                             2601       50       
##      [numeric]        min < med < max:               2 :   91 ( 3.5%)                             (98.1%)    (1.9%)   
##                       1 < 4 < 5                      3 :  370 (14.2%)       II                                        
##                       IQR (CV) : 1 (0.2)             4 :  916 (35.2%)       IIIIIII                                   
##                                                      5 : 1196 (46.0%)       IIIIIIIII                                 
## 
## 16   inc_4            Mean (sd) : 4 (0.9)            1 :  33 ( 1.3%)                              2587       64       
##      [numeric]        min < med < max:               2 : 110 ( 4.3%)                              (97.6%)    (2.4%)   
##                       1 < 4 < 5                      3 : 634 (24.5%)        IIII                                      
##                       IQR (CV) : 2 (0.2)             4 : 895 (34.6%)        IIIIII                                    
##                                                      5 : 915 (35.4%)        IIIIIII                                   
## 
## 17   inc_5            Mean (sd) : 4.1 (1)            1 :   71 ( 2.7%)                             2604       47       
##      [numeric]        min < med < max:               2 :  125 ( 4.8%)                             (98.2%)    (1.8%)   
##                       1 < 4 < 5                      3 :  390 (15.0%)       II                                        
##                       IQR (CV) : 1 (0.2)             4 : 1018 (39.1%)       IIIIIII                                   
##                                                      5 : 1000 (38.4%)       IIIIIII                                   
## 
## 18   lea_1            Mean (sd) : 4.2 (0.8)          1 :   24 ( 0.9%)                             2631       20       
##      [numeric]        min < med < max:               2 :   79 ( 3.0%)                             (99.2%)    (0.8%)   
##                       1 < 4 < 5                      3 :  335 (12.7%)       II                                        
##                       IQR (CV) : 1 (0.2)             4 : 1074 (40.8%)       IIIIIIII                                  
##                                                      5 : 1119 (42.5%)       IIIIIIII                                  
## 
## 19   lea_2            Mean (sd) : 4.3 (0.9)          1 :   35 ( 1.3%)                             2632       19       
##      [numeric]        min < med < max:               2 :   75 ( 2.8%)                             (99.3%)    (0.7%)   
##                       1 < 4 < 5                      3 :  287 (10.9%)       II                                        
##                       IQR (CV) : 1 (0.2)             4 :  979 (37.2%)       IIIIIII                                   
##                                                      5 : 1256 (47.7%)       IIIIIIIII                                 
## 
## 20   lea_3            Mean (sd) : 4.3 (0.8)          1 :   22 ( 0.8%)                             2637       14       
##      [numeric]        min < med < max:               2 :   58 ( 2.2%)                             (99.5%)    (0.5%)   
##                       1 < 4 < 5                      3 :  197 ( 7.5%)       I                                         
##                       IQR (CV) : 1 (0.2)             4 : 1147 (43.5%)       IIIIIIII                                  
##                                                      5 : 1213 (46.0%)       IIIIIIIII                                 
## 
## 21   lea_4            Mean (sd) : 4.4 (0.8)          1 :   15 ( 0.6%)                             2634       17       
##      [numeric]        min < med < max:               2 :   41 ( 1.6%)                             (99.4%)    (0.6%)   
##                       1 < 5 < 5                      3 :  267 (10.1%)       II                                        
##                       IQR (CV) : 1 (0.2)             4 :  930 (35.3%)       IIIIIII                                   
##                                                      5 : 1381 (52.4%)       IIIIIIIIII                                
## 
## 22   age              1. 18-24                        149 ( 5.6%)           I                     2649       2        
##      [character]      2. 25-34                       1269 (47.9%)           IIIIIIIII             (99.9%)    (0.1%)   
##                       3. 35-44                        797 (30.1%)           IIIIII                                    
##                       4. 45-54                        302 (11.4%)           II                                        
##                       5. 55-64                         78 ( 2.9%)                                                     
##                       6. 65+                           54 ( 2.0%)                                                     
## 
## 23   hiredate         min : 2004-12-24               1436 distinct values                     :   2651       0        
##      [Date]           med : 2022-06-30                                                        :   (100.0%)   (0.0%)   
##                       max : 2024-02-29                                                        :                       
##                       range : 19y 2m 5d                                                     : :                       
##                                                                                       . . : : :                       
## 
## 24   race             1. American Indian/Alaskan N      2 ( 0.1%)                                 1584       1067     
##      [character]      2. Asian                        359 (22.7%)           IIII                  (59.8%)    (40.2%)  
##                       3. Black or African American     48 ( 3.0%)                                                     
##                       4. Hispanic or Latino            80 ( 5.1%)           I                                         
##                       5. Native Hawaiian or Other       2 ( 0.1%)                                                     
##                       6. Two or More Races             43 ( 2.7%)                                                     
##                       7. White                       1050 (66.3%)           IIIIIIIIIIIII                             
## 
## 25   gender           1. Female                       820 (30.9%)           IIIIII                2650       1        
##      [character]      2. Male                        1830 (69.1%)           IIIIIIIIIIIII         (100.0%)   (0.0%)   
## 
## 26   manager_status   1. Manager                      654 (24.7%)           IIII                  2651       0        
##      [character]      2. Non-Manager                 1997 (75.3%)           IIIIIIIIIIIIIII       (100.0%)   (0.0%)   
## 
## 27   country          1. Australia                    101 ( 3.8%)                                 2651       0        
##      [character]      2. Canada                        84 ( 3.2%)                                 (100.0%)   (0.0%)   
##                       3. China                         60 ( 2.3%)                                                     
##                       4. Denmark                       24 ( 0.9%)                                                     
##                       5. France                       136 ( 5.1%)           I                                         
##                       6. Germany                       48 ( 1.8%)                                                     
##                       7. India                        288 (10.9%)           II                                        
##                       8. United Kingdom               359 (13.5%)           II                                        
##                       9. United States               1551 (58.5%)           IIIIIIIIIII                               
## 
## 28   hireyear         1. 2023                        954 (36.0%)            IIIIIII               2651       0        
##      [character]      2. 2022                        632 (23.8%)            IIII                  (100.0%)   (0.0%)   
##                       3. 2021                        308 (11.6%)            II                                        
##                       4. 2020                        307 (11.6%)            II                                        
##                       5. 2019                        129 ( 4.9%)                                                      
##                       6. 2018                        125 ( 4.7%)                                                      
##                       7. 2017                         42 ( 1.6%)                                                      
##                       8. 2014                         41 ( 1.5%)                                                      
##                       9. 2015                         40 ( 1.5%)                                                      
##                       10. 2024                        38 ( 1.4%)                                                      
##                       [ 5 others ]                    35 ( 1.3%)                                                      
## 
## 29   tenure_group     1. 1-2 years                   670 (25.3%)            IIIII                 2651       0        
##      [character]      2. 10+ years                     4 ( 0.2%)                                  (100.0%)   (0.0%)   
##                       3. 2-4 years                   644 (24.3%)            IIII                                      
##                       4. 3-6 months                  325 (12.3%)            II                                        
##                       5. 4-6 years                   274 (10.3%)            II                                        
##                       6. 6-10 years                  163 ( 6.1%)            I                                         
##                       7. 6-12 months                 507 (19.1%)            III                                       
##                       8. Under 3 months               64 ( 2.4%)                                                      
## ----------------------------------------------------------------------------------------------------------------------
#the end of data-processing & preparation for analysis

# 2. Data Analysis

    1. Calculate the favorable scores for all questions and factors for Hooli Overall.
      1. The favorability scores for all questions below:
dfSummary(df_2)
## Data Frame Summary  
## df_2  
## Dimensions: 2651 x 29  
## Duplicates: 0  
## 
## ----------------------------------------------------------------------------------------------------------------------
## No   Variable         Stats / Values                 Freqs (% of Valid)     Graph                 Valid      Missing  
## ---- ---------------- ------------------------------ ---------------------- --------------------- ---------- ---------
## 1    eeid             1. M00001                         1 ( 0.0%)                                 2651       0        
##      [character]      2. M00002                         1 ( 0.0%)                                 (100.0%)   (0.0%)   
##                       3. M00003                         1 ( 0.0%)                                                     
##                       4. M00004                         1 ( 0.0%)                                                     
##                       5. M00005                         1 ( 0.0%)                                                     
##                       6. M00006                         1 ( 0.0%)                                                     
##                       7. M00007                         1 ( 0.0%)                                                     
##                       8. M00008                         1 ( 0.0%)                                                     
##                       9. M00009                         1 ( 0.0%)                                                     
##                       10. M00010                        1 ( 0.0%)                                                     
##                       [ 2641 others ]                2641 (99.6%)           IIIIIIIIIIIIIIIIIII                       
## 
## 2    ali_1            Mean (sd) : 4.3 (0.8)          1 :   27 ( 1.0%)                             2633       18       
##      [numeric]        min < med < max:               2 :   64 ( 2.4%)                             (99.3%)    (0.7%)   
##                       1 < 4 < 5                      3 :  208 ( 7.9%)       I                                         
##                       IQR (CV) : 1 (0.2)             4 : 1124 (42.7%)       IIIIIIII                                  
##                                                      5 : 1210 (46.0%)       IIIIIIIII                                 
## 
## 3    ali_2            Mean (sd) : 4.2 (0.9)          1 :   44 ( 1.7%)                             2627       24       
##      [numeric]        min < med < max:               2 :   86 ( 3.3%)                             (99.1%)    (0.9%)   
##                       1 < 4 < 5                      3 :  310 (11.8%)       II                                        
##                       IQR (CV) : 1 (0.2)             4 : 1088 (41.4%)       IIIIIIII                                  
##                                                      5 : 1099 (41.8%)       IIIIIIII                                  
## 
## 4    ali_3            Mean (sd) : 3.8 (1.1)          1 :  101 ( 3.8%)                             2627       24       
##      [numeric]        min < med < max:               2 :  232 ( 8.8%)       I                     (99.1%)    (0.9%)   
##                       1 < 4 < 5                      3 :  496 (18.9%)       III                                       
##                       IQR (CV) : 2 (0.3)             4 : 1086 (41.3%)       IIIIIIII                                  
##                                                      5 :  712 (27.1%)       IIIII                                     
## 
## 5    col_1            Mean (sd) : 4.1 (0.9)          1 :   33 ( 1.3%)                             2638       13       
##      [numeric]        min < med < max:               2 :  114 ( 4.3%)                             (99.5%)    (0.5%)   
##                       1 < 4 < 5                      3 :  318 (12.1%)       II                                        
##                       IQR (CV) : 1 (0.2)             4 : 1135 (43.0%)       IIIIIIII                                  
##                                                      5 : 1038 (39.3%)       IIIIIII                                   
## 
## 6    col_2            Mean (sd) : 3.8 (0.9)          1 :  43 ( 1.7%)                              2574       77       
##      [numeric]        min < med < max:               2 : 130 ( 5.1%)        I                     (97.1%)    (2.9%)   
##                       1 < 4 < 5                      3 : 853 (33.1%)        IIIIII                                    
##                       IQR (CV) : 2 (0.2)             4 : 900 (35.0%)        IIIIII                                    
##                                                      5 : 648 (25.2%)        IIIII                                     
## 
## 7    col_3            Mean (sd) : 4.1 (1)            1 :   55 ( 2.1%)                             2626       25       
##      [numeric]        min < med < max:               2 :  128 ( 4.9%)                             (99.1%)    (0.9%)   
##                       1 < 4 < 5                      3 :  327 (12.5%)       II                                        
##                       IQR (CV) : 1 (0.2)             4 : 1037 (39.5%)       IIIIIII                                   
##                                                      5 : 1079 (41.1%)       IIIIIIII                                  
## 
## 8    eng_1            Mean (sd) : 4.5 (0.7)          1 :    5 ( 0.2%)                             2645       6        
##      [numeric]        min < med < max:               2 :   23 ( 0.9%)                             (99.8%)    (0.2%)   
##                       1 < 5 < 5                      3 :  181 ( 6.8%)       I                                         
##                       IQR (CV) : 1 (0.2)             4 :  869 (32.9%)       IIIIII                                    
##                                                      5 : 1567 (59.2%)       IIIIIIIIIII                               
## 
## 9    eng_2            Mean (sd) : 4.3 (0.9)          1 :   48 ( 1.8%)                             2646       5        
##      [numeric]        min < med < max:               2 :  104 ( 3.9%)                             (99.8%)    (0.2%)   
##                       1 < 5 < 5                      3 :  243 ( 9.2%)       I                                         
##                       IQR (CV) : 1 (0.2)             4 :  827 (31.3%)       IIIIII                                    
##                                                      5 : 1424 (53.8%)       IIIIIIIIII                                
## 
## 10   eng_3            Mean (sd) : 4.2 (0.9)          1 :   51 ( 1.9%)                             2639       12       
##      [numeric]        min < med < max:               2 :   95 ( 3.6%)                             (99.5%)    (0.5%)   
##                       1 < 4 < 5                      3 :  379 (14.4%)       II                                        
##                       IQR (CV) : 1 (0.2)             4 :  995 (37.7%)       IIIIIII                                   
##                                                      5 : 1119 (42.4%)       IIIIIIII                                  
## 
## 11   eng_4            Mean (sd) : 3.9 (1)            1 :  60 ( 2.3%)                              2643       8        
##      [numeric]        min < med < max:               2 : 249 ( 9.4%)        I                     (99.7%)    (0.3%)   
##                       1 < 4 < 5                      3 : 523 (19.8%)        III                                       
##                       IQR (CV) : 2 (0.3)             4 : 950 (35.9%)        IIIIIII                                   
##                                                      5 : 861 (32.6%)        IIIIII                                    
## 
## 12   eng_5            Mean (sd) : 4 (1)              1 :   55 ( 2.1%)                             2643       8        
##      [numeric]        min < med < max:               2 :  163 ( 6.2%)       I                     (99.7%)    (0.3%)   
##                       1 < 4 < 5                      3 :  516 (19.5%)       III                                       
##                       IQR (CV) : 2 (0.3)             4 :  896 (33.9%)       IIIIII                                    
##                                                      5 : 1013 (38.3%)       IIIIIII                                   
## 
## 13   inc_1            Mean (sd) : 4.4 (0.8)          1 :   26 ( 1.0%)                             2632       19       
##      [numeric]        min < med < max:               2 :   52 ( 2.0%)                             (99.3%)    (0.7%)   
##                       1 < 5 < 5                      3 :  196 ( 7.4%)       I                                         
##                       IQR (CV) : 1 (0.2)             4 :  828 (31.5%)       IIIIII                                    
##                                                      5 : 1530 (58.1%)       IIIIIIIIIII                               
## 
## 14   inc_2            Mean (sd) : 4 (0.9)            1 :  30 ( 1.2%)                              2572       79       
##      [numeric]        min < med < max:               2 : 103 ( 4.0%)                              (97.0%)    (3.0%)   
##                       1 < 4 < 5                      3 : 575 (22.4%)        IIII                                      
##                       IQR (CV) : 2 (0.2)             4 : 908 (35.3%)        IIIIIII                                   
##                                                      5 : 956 (37.2%)        IIIIIII                                   
## 
## 15   inc_3            Mean (sd) : 4.2 (0.9)          1 :   28 ( 1.1%)                             2601       50       
##      [numeric]        min < med < max:               2 :   91 ( 3.5%)                             (98.1%)    (1.9%)   
##                       1 < 4 < 5                      3 :  370 (14.2%)       II                                        
##                       IQR (CV) : 1 (0.2)             4 :  916 (35.2%)       IIIIIII                                   
##                                                      5 : 1196 (46.0%)       IIIIIIIII                                 
## 
## 16   inc_4            Mean (sd) : 4 (0.9)            1 :  33 ( 1.3%)                              2587       64       
##      [numeric]        min < med < max:               2 : 110 ( 4.3%)                              (97.6%)    (2.4%)   
##                       1 < 4 < 5                      3 : 634 (24.5%)        IIII                                      
##                       IQR (CV) : 2 (0.2)             4 : 895 (34.6%)        IIIIII                                    
##                                                      5 : 915 (35.4%)        IIIIIII                                   
## 
## 17   inc_5            Mean (sd) : 4.1 (1)            1 :   71 ( 2.7%)                             2604       47       
##      [numeric]        min < med < max:               2 :  125 ( 4.8%)                             (98.2%)    (1.8%)   
##                       1 < 4 < 5                      3 :  390 (15.0%)       II                                        
##                       IQR (CV) : 1 (0.2)             4 : 1018 (39.1%)       IIIIIII                                   
##                                                      5 : 1000 (38.4%)       IIIIIII                                   
## 
## 18   lea_1            Mean (sd) : 4.2 (0.8)          1 :   24 ( 0.9%)                             2631       20       
##      [numeric]        min < med < max:               2 :   79 ( 3.0%)                             (99.2%)    (0.8%)   
##                       1 < 4 < 5                      3 :  335 (12.7%)       II                                        
##                       IQR (CV) : 1 (0.2)             4 : 1074 (40.8%)       IIIIIIII                                  
##                                                      5 : 1119 (42.5%)       IIIIIIII                                  
## 
## 19   lea_2            Mean (sd) : 4.3 (0.9)          1 :   35 ( 1.3%)                             2632       19       
##      [numeric]        min < med < max:               2 :   75 ( 2.8%)                             (99.3%)    (0.7%)   
##                       1 < 4 < 5                      3 :  287 (10.9%)       II                                        
##                       IQR (CV) : 1 (0.2)             4 :  979 (37.2%)       IIIIIII                                   
##                                                      5 : 1256 (47.7%)       IIIIIIIII                                 
## 
## 20   lea_3            Mean (sd) : 4.3 (0.8)          1 :   22 ( 0.8%)                             2637       14       
##      [numeric]        min < med < max:               2 :   58 ( 2.2%)                             (99.5%)    (0.5%)   
##                       1 < 4 < 5                      3 :  197 ( 7.5%)       I                                         
##                       IQR (CV) : 1 (0.2)             4 : 1147 (43.5%)       IIIIIIII                                  
##                                                      5 : 1213 (46.0%)       IIIIIIIII                                 
## 
## 21   lea_4            Mean (sd) : 4.4 (0.8)          1 :   15 ( 0.6%)                             2634       17       
##      [numeric]        min < med < max:               2 :   41 ( 1.6%)                             (99.4%)    (0.6%)   
##                       1 < 5 < 5                      3 :  267 (10.1%)       II                                        
##                       IQR (CV) : 1 (0.2)             4 :  930 (35.3%)       IIIIIII                                   
##                                                      5 : 1381 (52.4%)       IIIIIIIIII                                
## 
## 22   age              1. 18-24                        149 ( 5.6%)           I                     2649       2        
##      [character]      2. 25-34                       1269 (47.9%)           IIIIIIIII             (99.9%)    (0.1%)   
##                       3. 35-44                        797 (30.1%)           IIIIII                                    
##                       4. 45-54                        302 (11.4%)           II                                        
##                       5. 55-64                         78 ( 2.9%)                                                     
##                       6. 65+                           54 ( 2.0%)                                                     
## 
## 23   hiredate         min : 2004-12-24               1436 distinct values                     :   2651       0        
##      [Date]           med : 2022-06-30                                                        :   (100.0%)   (0.0%)   
##                       max : 2024-02-29                                                        :                       
##                       range : 19y 2m 5d                                                     : :                       
##                                                                                       . . : : :                       
## 
## 24   race             1. American Indian/Alaskan N      2 ( 0.1%)                                 1584       1067     
##      [character]      2. Asian                        359 (22.7%)           IIII                  (59.8%)    (40.2%)  
##                       3. Black or African American     48 ( 3.0%)                                                     
##                       4. Hispanic or Latino            80 ( 5.1%)           I                                         
##                       5. Native Hawaiian or Other       2 ( 0.1%)                                                     
##                       6. Two or More Races             43 ( 2.7%)                                                     
##                       7. White                       1050 (66.3%)           IIIIIIIIIIIII                             
## 
## 25   gender           1. Female                       820 (30.9%)           IIIIII                2650       1        
##      [character]      2. Male                        1830 (69.1%)           IIIIIIIIIIIII         (100.0%)   (0.0%)   
## 
## 26   manager_status   1. Manager                      654 (24.7%)           IIII                  2651       0        
##      [character]      2. Non-Manager                 1997 (75.3%)           IIIIIIIIIIIIIII       (100.0%)   (0.0%)   
## 
## 27   country          1. Australia                    101 ( 3.8%)                                 2651       0        
##      [character]      2. Canada                        84 ( 3.2%)                                 (100.0%)   (0.0%)   
##                       3. China                         60 ( 2.3%)                                                     
##                       4. Denmark                       24 ( 0.9%)                                                     
##                       5. France                       136 ( 5.1%)           I                                         
##                       6. Germany                       48 ( 1.8%)                                                     
##                       7. India                        288 (10.9%)           II                                        
##                       8. United Kingdom               359 (13.5%)           II                                        
##                       9. United States               1551 (58.5%)           IIIIIIIIIII                               
## 
## 28   hireyear         1. 2023                        954 (36.0%)            IIIIIII               2651       0        
##      [character]      2. 2022                        632 (23.8%)            IIII                  (100.0%)   (0.0%)   
##                       3. 2021                        308 (11.6%)            II                                        
##                       4. 2020                        307 (11.6%)            II                                        
##                       5. 2019                        129 ( 4.9%)                                                      
##                       6. 2018                        125 ( 4.7%)                                                      
##                       7. 2017                         42 ( 1.6%)                                                      
##                       8. 2014                         41 ( 1.5%)                                                      
##                       9. 2015                         40 ( 1.5%)                                                      
##                       10. 2024                        38 ( 1.4%)                                                      
##                       [ 5 others ]                    35 ( 1.3%)                                                      
## 
## 29   tenure_group     1. 1-2 years                   670 (25.3%)            IIIII                 2651       0        
##      [character]      2. 10+ years                     4 ( 0.2%)                                  (100.0%)   (0.0%)   
##                       3. 2-4 years                   644 (24.3%)            IIII                                      
##                       4. 3-6 months                  325 (12.3%)            II                                        
##                       5. 4-6 years                   274 (10.3%)            II                                        
##                       6. 6-10 years                  163 ( 6.1%)            I                                         
##                       7. 6-12 months                 507 (19.1%)            III                                       
##                       8. Under 3 months               64 ( 2.4%)                                                      
## ----------------------------------------------------------------------------------------------------------------------
favorability_ali_1 <- sum(df_2$ali_1 %in% c(4,5))/sum(df_2$ali_1 %in% c(1:5))
favorability_ali_1
## [1] 0.8864413
#let's create a function that automatates the process of calculating favorability scores for all questions at once:

function_favorability <- function(df) {
  favorability_score <- numeric(ncol(df))  # Include all columns
  column_names <- names(df)  # Get the column names
  
  for (i in 1:ncol(df)) {  # Start from column 1
    # Calculate the proportion
    favorability_score[i] <- sum(df[[i]] %in% c(4,5)) / sum(df[[i]] %in% c(1:5))
  }
  
  # Combine favorability scores and column names into a data frame
  result <- data.frame(column_name = column_names, favorability_score = favorability_score)
  
  return(result)
}

# Calculate favorability_score for columns 2 to 21
favorability_score <- function_favorability(df_2[, 2:21])

colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.8864413
## 2            ali_2          0.8325086
## 3            ali_3          0.6844309
## 4            col_1          0.8237301
## 5            col_2          0.6013986
## 6            col_3          0.8057883
## 7            eng_1          0.9209830
## 8            eng_2          0.8507181
## 9            eng_3          0.8010610
## 10           eng_4          0.6852062
## 11           eng_5          0.7222853
## 12           inc_1          0.8958967
## 13           inc_2          0.7247278
## 14           inc_3          0.8119954
## 15           inc_4          0.6996521
## 16           inc_5          0.7749616
## 17           lea_1          0.8335234
## 18           lea_2          0.8491641
## 19           lea_3          0.8949564
## 20           lea_4          0.8773728
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  
print(favorability_score_percent)
##    question_number favorability_score
## 1            ali_1                 89
## 2            ali_2                 83
## 3            ali_3                 68
## 4            col_1                 82
## 5            col_2                 60
## 6            col_3                 81
## 7            eng_1                 92
## 8            eng_2                 85
## 9            eng_3                 80
## 10           eng_4                 69
## 11           eng_5                 72
## 12           inc_1                 90
## 13           inc_2                 72
## 14           inc_3                 81
## 15           inc_4                 70
## 16           inc_5                 77
## 17           lea_1                 83
## 18           lea_2                 85
## 19           lea_3                 89
## 20           lea_4                 88
#graph favorable score for each question
##setting different colors for different factors

library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")

##

group_colors <- c(
  "Alignment" = "#D08770",  # Light orange for Group 1
  "Collaboration" = "#A3BE8C",  # Light green for Group 2
  "Engagement" = "#5E81AC",  # Light blue for Group 3
  "Inclusion" = "#EBCB8B",  # Light yellow for Group 4
  "Leadership" = "#B48EAD"   # Mild purple for Group 5
)

## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Question Item") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

- b. Calculate a ‘Factor Favorable Score’, which is the average favorable score among questions within a factor, for each factor.

#let's create a function that automates the process of calculating of factor favorable score.

## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations

## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.8011269
## 2                 col              0.7436390
## 3                 eng              0.7960507
## 4                 inc              0.7814467
## 5                 lea              0.8637542
#change the factor favorable scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 80     Alignment
## 1            ali_1                 89     Alignment
## 2            ali_2                 83     Alignment
## 3            ali_3                 68     Alignment
## 22             col                 74 Collaboration
## 4            col_1                 82 Collaboration
## 5            col_2                 60 Collaboration
## 6            col_3                 81 Collaboration
## 23             eng                 80    Engagement
## 7            eng_1                 92    Engagement
## 8            eng_2                 85    Engagement
## 9            eng_3                 80    Engagement
## 10           eng_4                 69    Engagement
## 11           eng_5                 72    Engagement
## 24             inc                 78     Inclusion
## 12           inc_1                 90     Inclusion
## 13           inc_2                 72     Inclusion
## 14           inc_3                 81     Inclusion
## 15           inc_4                 70     Inclusion
## 16           inc_5                 77     Inclusion
## 25             lea                 86    Leadership
## 17           lea_1                 83    Leadership
## 18           lea_2                 85    Leadership
## 19           lea_3                 89    Leadership
## 20           lea_4                 88    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's enagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "Overall",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
##   question_number favorability_score
## 1         Overall          0.7960507
#create a graph that shows enagement favorable score across tenure groups (all)

engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
##    question_number favorability_score     Factor
## 23             eng                 80 Engagement
## 7            eng_1                 92 Engagement
## 8            eng_2                 85 Engagement
## 9            eng_3                 80 Engagement
## 10           eng_4                 69 Engagement
## 11           eng_5                 72 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # Purple for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

* 2. Calculate engagement factor (favorable) scores by all tenure groups (need to clarify this question..) - a. engagement factor favorable scores for employees whose tenure are “Under 3 Months”. n = 64

# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$tenure_group == "Under 3 months", 8:12])

colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1           0.984375
## 2           eng_2           0.859375
## 3           eng_3           0.906250
## 4           eng_4           0.890625
## 5           eng_5           0.828125
# Calculate the average engagement factor favorable score for those under 3 months
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)


average_row <- data.frame(
  question_number = "average",  
  favorability_score = average_score
)

favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
##   question_number favorability_score
## 1           eng_1           0.984375
## 2           eng_2           0.859375
## 3           eng_3           0.906250
## 4           eng_4           0.890625
## 5           eng_5           0.828125
## 6         average           0.893750
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100

print(favorability_score_with_average_percent)
##   question_number favorability_score
## 1           eng_1                 98
## 2           eng_2                 86
## 3           eng_3                 91
## 4           eng_4                 89
## 5           eng_5                 83
## 6         average                 89
#visualize the favorability scores


# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # Purple for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Employees with Under 3 Months Tenure") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

#append this group's engagement factor favorability score to the tenure group df
engagement_by_tenure <- rbind(engagement_by_tenure, average_row)
engagement_by_tenure$question_number[engagement_by_tenure$question_number == "average"] <- "Under 3 Months"
  • b.engagement factor favorability scores for employees whose tenure are “3-6 months”. n = 325
#3-6 months
df_2[df_2$tenure_group == "3-6 months" & !is.na(df_2$tenure_group), ]
## # A tibble: 325 × 29
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M003…     5     5     4     5     4     5     5     5     5     4     3     5
##  2 M001…     5     5     3     5     3     5     5     5     5     4     5     5
##  3 M002…     5     5     5     5     5     5     5     5     5     4     4     5
##  4 M021…     5     4     4     4     3     5     5     3     4     2     4     5
##  5 M024…     5     4     2     4     3     3     4     5     4     3     3     4
##  6 M014…     4     4     4     4     4     4     4     2     3     2     4     3
##  7 M017…     5     5     5     5     5     5     5     5     5     5     5     5
##  8 M020…     4     4     3     4     4     4     5     5     4     4     5     5
##  9 M017…     3     4     3     4     3     5     5     5     5     4     5     2
## 10 M019…     5     5     5     5     5     5     5     5     5     5     5     5
## # ℹ 315 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 8 to 12 (engagement questions)
favorability_score <- function_favorability(df_2[df_2$tenure_group == "3-6 months", 8:12])

colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9506173
## 2           eng_2          0.9040248
## 3           eng_3          0.8796296
## 4           eng_4          0.8209877
## 5           eng_5          0.8580247
# Calculate the average engagement factor favorable score for those with 3-6 months tenure
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)

average_row <- data.frame(
  question_number = "average",  
  favorability_score = average_score
)

favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
##   question_number favorability_score
## 1           eng_1          0.9506173
## 2           eng_2          0.9040248
## 3           eng_3          0.8796296
## 4           eng_4          0.8209877
## 5           eng_5          0.8580247
## 6         average          0.8826568
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100

print(favorability_score_with_average_percent)
##   question_number favorability_score
## 1           eng_1                 95
## 2           eng_2                 90
## 3           eng_3                 88
## 4           eng_4                 82
## 5           eng_5                 86
## 6         average                 88
#visualize the favorability scores


# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")

# graph showing engagement factor scores for the 3-6 months tenure group

ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Employees with 3-6 Months Tenure") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

#append this group's engagement factor favorability score to the tenure group df
engagement_by_tenure <- rbind(engagement_by_tenure, average_row)
engagement_by_tenure$question_number[engagement_by_tenure$question_number == "average"] <- "3-6 Months"
- c.engagement factor favorability scores for employees whose tenure are "6-12 months".
  n = 507
#6-12 months
df_2[df_2$tenure_group == "6-12 months" & !is.na(df_2$tenure_group), ]
## # A tibble: 507 × 29
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M012…     4     3     4     3     3     3     4     3     4     3     3     4
##  2 M025…     5     5     4     4     5     5     5     4     5     4     5     5
##  3 M007…     4     3     3     5     3     4     4     4     3     3     3     4
##  4 M020…     5     5     5     5     5     5     5     5     4     5     5     5
##  5 M011…     4     5     4     4     4     5     5     5     5     5     5     4
##  6 M011…     5     4     4     5     2     5     5     4     5     2     3     5
##  7 M007…     4     4     4     4     4     4     5     4     4     5     4     5
##  8 M024…     2     4     3     3     4     4     3     1     2     3     2     3
##  9 M012…     4     2     2     4     5     3     4     3     4     3     3     5
## 10 M019…     5     5     5     5     5     5     5     5     5     5     5     5
## # ℹ 497 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 8 to 12 (engagement questions)
favorability_score <- function_favorability(df_2[df_2$tenure_group == "6-12 months", 8:12])

colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9090909
## 2           eng_2          0.8106509
## 3           eng_3          0.8134921
## 4           eng_4          0.7199211
## 5           eng_5          0.7312253
# Calculate the average engagement factor favorable score for those with 6-12 months tenure
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)


average_row <- data.frame(
  question_number = "average",  
  favorability_score = average_score
)

favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
##   question_number favorability_score
## 1           eng_1          0.9090909
## 2           eng_2          0.8106509
## 3           eng_3          0.8134921
## 4           eng_4          0.7199211
## 5           eng_5          0.7312253
## 6         average          0.7968761
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100

print(favorability_score_with_average_percent)
##   question_number favorability_score
## 1           eng_1                 91
## 2           eng_2                 81
## 3           eng_3                 81
## 4           eng_4                 72
## 5           eng_5                 73
## 6         average                 80
#visualize the favorability scores


# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")

# graph showing engagement factor scores for the 6-12 months tenure group
ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Employees with 6-12 Months Tenure") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

#append this group's engagement factor favorability score to the tenure group df
engagement_by_tenure <- rbind(engagement_by_tenure, average_row)
engagement_by_tenure$question_number[engagement_by_tenure$question_number == "average"] <- "6-12 Months"
- d. engagement factor favorability scores for employees whose tenure are "1-2 years".
  N = 670
#1-2 years
df_2[df_2$tenure_group == "1-2 years" & !is.na(df_2$tenure_group), ]
## # A tibble: 670 × 29
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M006…     4     4     4     4     4     4     4     4     4     3     3     4
##  2 M017…     4     4     3     4     4     4     4     4     3     4     4     4
##  3 M014…     5     5     4     5     5     5     5     5     4     3     3     5
##  4 M013…     4     5     4     5     4     5     5     5     5     4     5     5
##  5 M000…     5     4     4     3     5     3     5     5     5     5     5     5
##  6 M020…     5     4     3     5     3     4     5     5     3     4     4     5
##  7 M016…     3     3     2     4     3     4     4     4     4     3     3     3
##  8 M002…     4     4     5     4     3     5     5     5     4     3     4     5
##  9 M017…     4     4     3     4     4     4     4     4     5     4     4     4
## 10 M012…     5     5     4     5     4     4     4     4     4     5     5     4
## # ℹ 660 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 8 to 12 (engagement questions)
favorability_score <- function_favorability(df_2[df_2$tenure_group == "1-2 years", 8:12])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9235382
## 2           eng_2          0.8579970
## 3           eng_3          0.7976012
## 4           eng_4          0.6616766
## 5           eng_5          0.7140719
# Calculate the average engagement factor favorable score for those with 1-2 years tenure
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)


average_row <- data.frame(
  question_number = "average",  
  favorability_score = average_score
)

favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
##   question_number favorability_score
## 1           eng_1          0.9235382
## 2           eng_2          0.8579970
## 3           eng_3          0.7976012
## 4           eng_4          0.6616766
## 5           eng_5          0.7140719
## 6         average          0.7909770
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100

print(favorability_score_with_average_percent)
##   question_number favorability_score
## 1           eng_1                 92
## 2           eng_2                 86
## 3           eng_3                 80
## 4           eng_4                 66
## 5           eng_5                 71
## 6         average                 79
#visualize the favorability scores


# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")


# graph showing engagement factor scores for the 1-2 years tenure group
ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Employees with 1-2 years Tenure") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

#append this group's engagement factor favorability score to the tenure group df
engagement_by_tenure <- rbind(engagement_by_tenure, average_row)
engagement_by_tenure$question_number[engagement_by_tenure$question_number == "average"] <- "1-2 Years"
- e. engagement factor favorability scores for employees whose tenure are "2-4 years".
  n = 644
#2-4 years
df_2[df_2$tenure_group == "2-4 years" & !is.na(df_2$tenure_group), ]
## # A tibble: 644 × 29
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M010…     5     5     5     5     4     5     5     5     4     5     3     5
##  2 M017…    NA    NA    NA    NA    NA    NA     4    NA    NA    NA    NA    NA
##  3 M004…     4     5     5     4     5     5     5     3     4     4     4     4
##  4 M014…     4     4     2     3     2     3     4     4     3     2     3     5
##  5 M009…     4     5     4     4     5     5     4     1     1     3     4     5
##  6 M008…     5     5     5     5     5     5     5     5     5     5     5     5
##  7 M020…     5     5     5     5     4     5     5     5     5     5     5     5
##  8 M024…     5     5     4     3     5     5     5     5     3     2     3     5
##  9 M021…     3     5     4     2     5     5     3     2     3     2     3     3
## 10 M022…     4     2     4     4     2     1     3     3     5     3     3     4
## # ℹ 634 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 8 to 12 (engagement questions)
favorability_score <- function_favorability(df_2[df_2$tenure_group == "2-4 years", 8:12])

colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9144635
## 2           eng_2          0.8429238
## 3           eng_3          0.7659906
## 4           eng_4          0.6209048
## 5           eng_5          0.6931464
# Calculate the average engagement factor favorable score for those with 2-4 years tenure
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)


average_row <- data.frame(
  question_number = "average",  
  favorability_score = average_score
)

favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
##   question_number favorability_score
## 1           eng_1          0.9144635
## 2           eng_2          0.8429238
## 3           eng_3          0.7659906
## 4           eng_4          0.6209048
## 5           eng_5          0.6931464
## 6         average          0.7674858
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100

print(favorability_score_with_average_percent)
##   question_number favorability_score
## 1           eng_1                 91
## 2           eng_2                 84
## 3           eng_3                 77
## 4           eng_4                 62
## 5           eng_5                 69
## 6         average                 77
#visualize the favorability scores




# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")


# graph showing engagement factor scores for the 2-4 years tenure group
ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Employees with 2-4 years Tenure") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

#append this group's engagement factor favorability score to the tenure group df
engagement_by_tenure <- rbind(engagement_by_tenure, average_row)
engagement_by_tenure$question_number[engagement_by_tenure$question_number == "average"] <- "2-4 Years"
- f. engagement factor scores for employees whose tenure are "4-6 years".
  N = 274
#4-6 years
df_2[df_2$tenure_group == "4-6 years" & !is.na(df_2$tenure_group), ]
## # A tibble: 274 × 29
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M008…     5     5     5     5     5     5     5     5     5     5     5     5
##  2 M001…     5     5     4     5     5     5     5     5     5     4     3     5
##  3 M002…     5     5     4     5     4     4     5     5     5     5     5     5
##  4 M018…     4     4     4     2     5     5     4     1     3     4     3     4
##  5 M013…     5     5     4     4    NA     5     5     5     5     4     5     5
##  6 M006…     5     5     2     5     3     4     5     5     5     4     2     4
##  7 M009…     5     5     3     5     5     5     5     5     5     5     5     5
##  8 M005…     4     4     4     4     3     3     4     4     4     4     4     4
##  9 M023…     5     5     5     4     5     5     5     4     5     3     3     5
## 10 M009…     5     5     4     5     5     5     5     5     5     5     5     5
## # ℹ 264 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 8 to 12 (engagement questions)
favorability_score <- function_favorability(df_2[df_2$tenure_group == "4-6 years", 8:12])

colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9014599
## 2           eng_2          0.8534799
## 3           eng_3          0.7867647
## 4           eng_4          0.6300366
## 5           eng_5          0.6167883
# Calculate the average engagement factor favorable score for those with 4-6 years tenure
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)


average_row <- data.frame(
  question_number = "average",  
  favorability_score = average_score
)

favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
##   question_number favorability_score
## 1           eng_1          0.9014599
## 2           eng_2          0.8534799
## 3           eng_3          0.7867647
## 4           eng_4          0.6300366
## 5           eng_5          0.6167883
## 6         average          0.7577059
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100

print(favorability_score_with_average_percent)
##   question_number favorability_score
## 1           eng_1                 90
## 2           eng_2                 85
## 3           eng_3                 79
## 4           eng_4                 63
## 5           eng_5                 62
## 6         average                 76
# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")


# graph showing engagement factor scores for the 4-6 years tenure group
ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Employees with 4-6 years Tenure") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

#append this group's engagement factor favorability score to the tenure group df
engagement_by_tenure <- rbind(engagement_by_tenure, average_row)
engagement_by_tenure$question_number[engagement_by_tenure$question_number == "average"] <- "4-6 Years"
 - g. engagement factor scores for employees whose tenure are "6-10 years".
   N = 163
#6-10 years
df_2[df_2$tenure_group == "6-10 years" & !is.na(df_2$tenure_group), ]
## # A tibble: 163 × 29
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M017…     4     4     4     4     4     3     4     4     4     4     4     4
##  2 M016…     5     5     5     5     4     5     5     5     4     3     4     5
##  3 M021…     5     5     4     5     4     5     5     5     4     5     5     5
##  4 M025…     4     4     4     4     4     4     5     5     3     3     4     4
##  5 M013…     4     3     3     3     3     3     4     3     3     2     2     4
##  6 M022…     3     3     4     5     3     3     4     3     3     2     2     4
##  7 M016…     5     5     4     5     5     5     5     5     5     5     5     5
##  8 M016…     4     4     3     4     3     4     4     4     3     3     3     4
##  9 M011…     4    NA    NA     4     4    NA     4     4     4     4     3     4
## 10 M011…     5     5     5     4     3     3     4     5     4     4     4     4
## # ℹ 153 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 8 to 12 (engagement questions)
favorability_score <- function_favorability(df_2[df_2$tenure_group == "6-10 years", 8:12])


colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9202454
## 2           eng_2          0.8588957
## 3           eng_3          0.7361963
## 4           eng_4          0.6604938
## 5           eng_5          0.7080745
# Calculate the average engagement factor favorable score for those with 6-10 years tenure
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)


average_row <- data.frame(
  question_number = "average",  
  favorability_score = average_score
)

favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
##   question_number favorability_score
## 1           eng_1          0.9202454
## 2           eng_2          0.8588957
## 3           eng_3          0.7361963
## 4           eng_4          0.6604938
## 5           eng_5          0.7080745
## 6         average          0.7767812
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100

print(favorability_score_with_average_percent)
##   question_number favorability_score
## 1           eng_1                 92
## 2           eng_2                 86
## 3           eng_3                 74
## 4           eng_4                 66
## 5           eng_5                 71
## 6         average                 78
# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")


# graph showing engagement factor scores for the 6-10 years tenure group
ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Employees with 6-10 years Tenure") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

#append this group's engagement factor favorability score to the tenure group df
engagement_by_tenure <- rbind(engagement_by_tenure, average_row)
engagement_by_tenure$question_number[engagement_by_tenure$question_number == "average"] <- "6-10 Years"
    1. engagement factor scores for employees whose tenure are “10+ years”. N = 4, given that this category has less than 5 employees, we don’t report it.
#it is critical that there are only 2 employees who have worked longer than 10 years.

#10+ years
df_2[df_2$tenure_group == "10+ years" & !is.na(df_2$tenure_group), ]
## # A tibble: 4 × 29
##   eeid   ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01243     4     4     4     4     4     4     5     5     5     4     4     5
## 2 M01936     5     5     3     5     5     5     5     5     5     4     5     5
## 3 M01361     5     5     4     5     4     3     5     5     5     5     5     5
## 4 M02343     5     5     4     5     5     5     5     5     5     4     3     3
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 8 to 12 (engagement questions)
favorability_score <- function_favorability(df_2[df_2$tenure_group == "10+ years", 8:12])

colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1               1.00
## 2           eng_2               1.00
## 3           eng_3               1.00
## 4           eng_4               1.00
## 5           eng_5               0.75
# Calculate the average engagement factor favorable score for those with over 10 years tenure
average_score <- mean(favorability_score$favorability_score, na.rm = TRUE)


average_row <- data.frame(
  question_number = "average",  
  favorability_score = average_score
)

favorability_score_with_average <- rbind(favorability_score, average_row)
print(favorability_score_with_average)
##   question_number favorability_score
## 1           eng_1               1.00
## 2           eng_2               1.00
## 3           eng_3               1.00
## 4           eng_4               1.00
## 5           eng_5               0.75
## 6         average               0.95
favorability_score_with_average_percent <- favorability_score_with_average
favorability_score_with_average_percent$favorability_score <- round(favorability_score_with_average_percent $favorability_score,2)*100

print(favorability_score_with_average_percent)
##   question_number favorability_score
## 1           eng_1                100
## 2           eng_2                100
## 3           eng_3                100
## 4           eng_4                100
## 5           eng_5                 75
## 6         average                 95
#visualize the favorability scores


# Distinguish the average row from other item rows
favorability_score_with_average_percent$highlight <- ifelse(favorability_score_with_average_percent$question_number == favorability_score_with_average_percent$question_number[6], "first", "other")

ggplot(favorability_score_with_average_percent, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Favorability Scores for Employees with Over 10 Years Tenure") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 0, hjust = 1), 
    plot.title = element_text(hjust = 0.5)  
  ) +
   geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5)

- i. Engagement factor favorable scores across tenure groups (employee life cycle)
engagement_by_tenure$favorability_score <- round(engagement_by_tenure$favorability_score*100)
engagement_by_tenure$question_number <- factor(engagement_by_tenure$question_number, levels = engagement_by_tenure$question_number)

engagement_by_tenure
##   question_number favorability_score
## 1         Overall                 80
## 2  Under 3 Months                 89
## 3      3-6 Months                 88
## 4     6-12 Months                 80
## 5       1-2 Years                 79
## 6       2-4 Years                 77
## 7       4-6 Years                 76
## 8      6-10 Years                 78
# Distinguish the average row from other item rows
engagement_by_tenure$highlight <- ifelse(engagement_by_tenure$question_number == engagement_by_tenure$question_number[1], "first", "other")


# graph showing engagement factor scores across tenure groups 
ggplot(engagement_by_tenure, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores Across Tenure Groups") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 11),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

    1. Calculate favorable scores for all questions and factors by gender
      1. favorable scores for all questions and factors for women
      • N for female = 820
table(df_2$gender)
## 
## Female   Male 
##    820   1830
df_2[df_2$gender== "Female" & !is.na(df_2$gender), ]
## # A tibble: 820 × 29
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M017…     4     4     3     4     4     4     4     4     3     4     4     4
##  2 M010…     5     5     5     5     4     5     5     5     4     5     3     5
##  3 M001…     5     5     3     5     3     5     5     5     5     4     5     5
##  4 M011…     4     5     4     4     4     5     5     5     5     5     5     4
##  5 M011…     5     4     4     5     2     5     5     4     5     2     3     5
##  6 M002…     4     4     5     4     3     5     5     5     4     3     4     5
##  7 M008…     5     5     5     5     5     5     5     5     5     5     5     5
##  8 M007…     4     1     1     3     2     5     4     2     3     2     2     2
##  9 M024…     5     5     4     3     5     5     5     5     3     2     3     5
## 10 M007…     4     4     4     4     4     4     5     4     4     5     4     5
## # ℹ 810 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$gender == "Female", 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.8441718
## 2            ali_2          0.7980296
## 3            ali_3          0.6584464
## 4            col_1          0.7610294
## 5            col_2          0.6022727
## 6            col_3          0.8204182
## 7            eng_1          0.8974359
## 8            eng_2          0.8095238
## 9            eng_3          0.7533742
## 10           eng_4          0.6328029
## 11           eng_5          0.6813725
## 12           inc_1          0.8620269
## 13           inc_2          0.6492537
## 14           inc_3          0.7382134
## 15           inc_4          0.6509317
## 16           inc_5          0.7719950
## 17           lea_1          0.8019680
## 18           lea_2          0.8009828
## 19           lea_3          0.8946078
## 20           lea_4          0.8378378
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  

library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")


print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 84     Alignment
## 2            ali_2                 80     Alignment
## 3            ali_3                 66     Alignment
## 4            col_1                 76 Collaboration
## 5            col_2                 60 Collaboration
## 6            col_3                 82 Collaboration
## 7            eng_1                 90    Engagement
## 8            eng_2                 81    Engagement
## 9            eng_3                 75    Engagement
## 10           eng_4                 63    Engagement
## 11           eng_5                 68    Engagement
## 12           inc_1                 86     Inclusion
## 13           inc_2                 65     Inclusion
## 14           inc_3                 74     Inclusion
## 15           inc_4                 65     Inclusion
## 16           inc_5                 77     Inclusion
## 17           lea_1                 80    Leadership
## 18           lea_2                 80    Leadership
## 19           lea_3                 89    Leadership
## 20           lea_4                 84    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.7668826
## 2                 col              0.7279068
## 3                 eng              0.7549019
## 4                 inc              0.7344841
## 5                 lea              0.8338491
#change the favorability scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")


#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 77     Alignment
## 1            ali_1                 84     Alignment
## 2            ali_2                 80     Alignment
## 3            ali_3                 66     Alignment
## 22             col                 73 Collaboration
## 4            col_1                 76 Collaboration
## 5            col_2                 60 Collaboration
## 6            col_3                 82 Collaboration
## 23             eng                 75    Engagement
## 7            eng_1                 90    Engagement
## 8            eng_2                 81    Engagement
## 9            eng_3                 75    Engagement
## 10           eng_4                 63    Engagement
## 11           eng_5                 68    Engagement
## 24             inc                 73     Inclusion
## 12           inc_1                 86     Inclusion
## 13           inc_2                 65     Inclusion
## 14           inc_3                 74     Inclusion
## 15           inc_4                 65     Inclusion
## 16           inc_5                 77     Inclusion
## 25             lea                 83    Leadership
## 17           lea_1                 80    Leadership
## 18           lea_2                 80    Leadership
## 19           lea_3                 89    Leadership
## 20           lea_4                 84    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "Overall",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
##   question_number favorability_score
## 1         Overall          0.7549019
#create a graph that shows engagement favorable score across tenure groups (all)

engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
##    question_number favorability_score     Factor
## 23             eng                 75 Engagement
## 7            eng_1                 90 Engagement
## 8            eng_2                 81 Engagement
## 9            eng_3                 75 Engagement
## 10           eng_4                 63 Engagement
## 11           eng_5                 68 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

#Create a ggplot that shows engagement factor favorable scores for women

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

- b. favorable scores for all questions and factors for men
    N = 1830
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$gender == "Male", 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.9053385
## 2            ali_2          0.8484013
## 3            ali_3          0.6958678
## 4            col_1          0.8517298
## 5            col_2          0.6007861
## 6            col_3          0.7991170
## 7            eng_1          0.9315068
## 8            eng_2          0.8691128
## 9            eng_3          0.8222710
## 10           eng_4          0.7090411
## 11           eng_5          0.7404162
## 12           inc_1          0.9111479
## 13           inc_2          0.7589134
## 14           inc_3          0.8450390
## 15           inc_4          0.7216611
## 16           inc_5          0.7761693
## 17           lea_1          0.8475509
## 18           lea_2          0.8706659
## 19           lea_3          0.8950549
## 20           lea_4          0.8949973
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  
print(favorability_score_percent)
##    question_number favorability_score
## 1            ali_1                 91
## 2            ali_2                 85
## 3            ali_3                 70
## 4            col_1                 85
## 5            col_2                 60
## 6            col_3                 80
## 7            eng_1                 93
## 8            eng_2                 87
## 9            eng_3                 82
## 10           eng_4                 71
## 11           eng_5                 74
## 12           inc_1                 91
## 13           inc_2                 76
## 14           inc_3                 85
## 15           inc_4                 72
## 16           inc_5                 78
## 17           lea_1                 85
## 18           lea_2                 87
## 19           lea_3                 90
## 20           lea_4                 89
library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")


print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 91     Alignment
## 2            ali_2                 85     Alignment
## 3            ali_3                 70     Alignment
## 4            col_1                 85 Collaboration
## 5            col_2                 60 Collaboration
## 6            col_3                 80 Collaboration
## 7            eng_1                 93    Engagement
## 8            eng_2                 87    Engagement
## 9            eng_3                 82    Engagement
## 10           eng_4                 71    Engagement
## 11           eng_5                 74    Engagement
## 12           inc_1                 91     Inclusion
## 13           inc_2                 76     Inclusion
## 14           inc_3                 85     Inclusion
## 15           inc_4                 72     Inclusion
## 16           inc_5                 78     Inclusion
## 17           lea_1                 85    Leadership
## 18           lea_2                 87    Leadership
## 19           lea_3                 90    Leadership
## 20           lea_4                 89    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.8165359
## 2                 col              0.7505443
## 3                 eng              0.8144696
## 4                 inc              0.8025861
## 5                 lea              0.8770673
#change the factor favorable scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")


#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 82     Alignment
## 1            ali_1                 91     Alignment
## 2            ali_2                 85     Alignment
## 3            ali_3                 70     Alignment
## 22             col                 75 Collaboration
## 4            col_1                 85 Collaboration
## 5            col_2                 60 Collaboration
## 6            col_3                 80 Collaboration
## 23             eng                 81    Engagement
## 7            eng_1                 93    Engagement
## 8            eng_2                 87    Engagement
## 9            eng_3                 82    Engagement
## 10           eng_4                 71    Engagement
## 11           eng_5                 74    Engagement
## 24             inc                 80     Inclusion
## 12           inc_1                 91     Inclusion
## 13           inc_2                 76     Inclusion
## 14           inc_3                 85     Inclusion
## 15           inc_4                 72     Inclusion
## 16           inc_5                 78     Inclusion
## 25             lea                 88    Leadership
## 17           lea_1                 85    Leadership
## 18           lea_2                 87    Leadership
## 19           lea_3                 90    Leadership
## 20           lea_4                 89    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores 

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "Overall",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
##   question_number favorability_score
## 1         Overall          0.8144696
#create a graph that shows engagement favorable score across tenure groups (all)

engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
##    question_number favorability_score     Factor
## 23             eng                 81 Engagement
## 7            eng_1                 93 Engagement
## 8            eng_2                 87 Engagement
## 9            eng_3                 82 Engagement
## 10           eng_4                 71 Engagement
## 11           eng_5                 74 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

# create a graph that shows engagement factor favorable scores for men 
ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

    1. favorable scores for all questions for women for Under 3 months N = 25
df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "Under 3 months" & 
  !is.na(df_2$gender), ]
## # A tibble: 25 × 29
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M016…     5     5     4     4     3     4     5     5     5     4     5     4
##  2 M017…     4     5     4     4     4     5     5     5     5     5     5     4
##  3 M016…     4     4     3     4     4     4     4     3     4     4     4     3
##  4 M009…     4     4     4     5     5     5     4     4     4     4     4     5
##  5 M023…     3     5     5     4     4     4     5     5     5     5     3     4
##  6 M000…     3     5     5     5     3     5     5     5     5     5     3     5
##  7 M020…     3     5     5     5     5     5     5     5     5     5     5     5
##  8 M021…     4     5     5     5    NA     5     5     5     4     5     5     5
##  9 M022…     4     5     3     4     3     5     5     5     4     5     5     5
## 10 M006…     5     5     4     4    NA     2     5     5     5     5     5     5
## # ℹ 15 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "Under 3 months" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.6000000
## 2            ali_2          0.8800000
## 3            ali_3          0.6800000
## 4            col_1          0.8800000
## 5            col_2          0.6500000
## 6            col_3          0.7916667
## 7            eng_1          0.9600000
## 8            eng_2          0.8000000
## 9            eng_3          0.9200000
## 10           eng_4          0.9600000
## 11           eng_5          0.8000000
## 12           inc_1          0.9200000
## 13           inc_2          0.7826087
## 14           inc_3          0.9200000
## 15           inc_4          0.7600000
## 16           inc_5          0.9200000
## 17           lea_1          0.9600000
## 18           lea_2          0.9200000
## 19           lea_3          0.9600000
## 20           lea_4          0.9600000
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  

library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 60     Alignment
## 2            ali_2                 88     Alignment
## 3            ali_3                 68     Alignment
## 4            col_1                 88 Collaboration
## 5            col_2                 65 Collaboration
## 6            col_3                 79 Collaboration
## 7            eng_1                 96    Engagement
## 8            eng_2                 80    Engagement
## 9            eng_3                 92    Engagement
## 10           eng_4                 96    Engagement
## 11           eng_5                 80    Engagement
## 12           inc_1                 92     Inclusion
## 13           inc_2                 78     Inclusion
## 14           inc_3                 92     Inclusion
## 15           inc_4                 76     Inclusion
## 16           inc_5                 92     Inclusion
## 17           lea_1                 96    Leadership
## 18           lea_2                 92    Leadership
## 19           lea_3                 96    Leadership
## 20           lea_4                 96    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.7200000
## 2                 col              0.7738889
## 3                 eng              0.8880000
## 4                 inc              0.8605217
## 5                 lea              0.9500000
#change the factor favorable scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 72     Alignment
## 1            ali_1                 60     Alignment
## 2            ali_2                 88     Alignment
## 3            ali_3                 68     Alignment
## 22             col                 77 Collaboration
## 4            col_1                 88 Collaboration
## 5            col_2                 65 Collaboration
## 6            col_3                 79 Collaboration
## 23             eng                 89    Engagement
## 7            eng_1                 96    Engagement
## 8            eng_2                 80    Engagement
## 9            eng_3                 92    Engagement
## 10           eng_4                 96    Engagement
## 11           eng_5                 80    Engagement
## 24             inc                 86     Inclusion
## 12           inc_1                 92     Inclusion
## 13           inc_2                 78     Inclusion
## 14           inc_3                 92     Inclusion
## 15           inc_4                 76     Inclusion
## 16           inc_5                 92     Inclusion
## 25             lea                 95    Leadership
## 17           lea_1                 96    Leadership
## 18           lea_2                 92    Leadership
## 19           lea_3                 96    Leadership
## 20           lea_4                 96    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure under 3 Months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure under 3 Months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "Under 3 months",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
##   question_number favorability_score
## 1  Under 3 months              0.888
#create a graph that shows engagement favorable scores for women with tenure under 3 months  

engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
##    question_number favorability_score     Factor
## 23             eng                 89 Engagement
## 7            eng_1                 96 Engagement
## 8            eng_2                 80 Engagement
## 9            eng_3                 92 Engagement
## 10           eng_4                 96 Engagement
## 11           eng_5                 80 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women with Tenure Under 3 Months") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

    1. favorable scores for all questions for women with tenure between 3-6 months N = 116
table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "3-6 months" & 
  !is.na(df_2$gender), ]
## # A tibble: 116 × 29
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M001…     5     5     3     5     3     5     5     5     5     4     5     5
##  2 M017…     3     4     3     4     3     5     5     5     5     4     5     2
##  3 M019…     5     5     5     5     5     5     5     5     5     5     5     5
##  4 M026…     4     4     4     4     4     4     5     5     5     5     4     4
##  5 M025…     5     5     4     5     5     5     5     5     5     5     5     5
##  6 M002…     4     3     4     3     4     4     5     3     4     4     3     4
##  7 M015…     3     3     3     3     3     3     4     4     4     4     4     4
##  8 M024…     4     4     4     4     4     4     4     5     4     4     5     5
##  9 M004…     5     5     5     5     5     5     5     5     5     5     5     5
## 10 M010…     4     4     4     4     5     4     4     2     3     3     3     3
## # ℹ 106 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "3-6 months" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.9130435
## 2            ali_2          0.8782609
## 3            ali_3          0.7739130
## 4            col_1          0.8534483
## 5            col_2          0.5904762
## 6            col_3          0.9130435
## 7            eng_1          0.9396552
## 8            eng_2          0.9130435
## 9            eng_3          0.8448276
## 10           eng_4          0.7931034
## 11           eng_5          0.8448276
## 12           inc_1          0.8782609
## 13           inc_2          0.6725664
## 14           inc_3          0.7894737
## 15           inc_4          0.7433628
## 16           inc_5          0.7982456
## 17           lea_1          0.8347826
## 18           lea_2          0.8956522
## 19           lea_3          0.9391304
## 20           lea_4          0.8608696
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  

library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")


print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 91     Alignment
## 2            ali_2                 88     Alignment
## 3            ali_3                 77     Alignment
## 4            col_1                 85 Collaboration
## 5            col_2                 59 Collaboration
## 6            col_3                 91 Collaboration
## 7            eng_1                 94    Engagement
## 8            eng_2                 91    Engagement
## 9            eng_3                 84    Engagement
## 10           eng_4                 79    Engagement
## 11           eng_5                 84    Engagement
## 12           inc_1                 88     Inclusion
## 13           inc_2                 67     Inclusion
## 14           inc_3                 79     Inclusion
## 15           inc_4                 74     Inclusion
## 16           inc_5                 80     Inclusion
## 17           lea_1                 83    Leadership
## 18           lea_2                 90    Leadership
## 19           lea_3                 94    Leadership
## 20           lea_4                 86    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.8550725
## 2                 col              0.7856560
## 3                 eng              0.8670915
## 4                 inc              0.7763819
## 5                 lea              0.8826087
#change the factor favorable scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 86     Alignment
## 1            ali_1                 91     Alignment
## 2            ali_2                 88     Alignment
## 3            ali_3                 77     Alignment
## 22             col                 79 Collaboration
## 4            col_1                 85 Collaboration
## 5            col_2                 59 Collaboration
## 6            col_3                 91 Collaboration
## 23             eng                 87    Engagement
## 7            eng_1                 94    Engagement
## 8            eng_2                 91    Engagement
## 9            eng_3                 84    Engagement
## 10           eng_4                 79    Engagement
## 11           eng_5                 84    Engagement
## 24             inc                 78     Inclusion
## 12           inc_1                 88     Inclusion
## 13           inc_2                 67     Inclusion
## 14           inc_3                 79     Inclusion
## 15           inc_4                 74     Inclusion
## 16           inc_5                 80     Inclusion
## 25             lea                 88    Leadership
## 17           lea_1                 83    Leadership
## 18           lea_2                 90    Leadership
## 19           lea_3                 94    Leadership
## 20           lea_4                 86    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 3-6 months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 3-6 months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "3-6 months",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
##   question_number favorability_score
## 1      3-6 months          0.8670915
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
##    question_number favorability_score     Factor
## 23             eng                 87 Engagement
## 7            eng_1                 94 Engagement
## 8            eng_2                 91 Engagement
## 9            eng_3                 84 Engagement
## 10           eng_4                 79 Engagement
## 11           eng_5                 84 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women with Tenure between 3-6 Months") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

- f. favorable scores for all questions for women with tenure between 6-12 months N = 181

table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "6-12 months" & 
  !is.na(df_2$gender), ]
## # A tibble: 181 × 29
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M011…     4     5     4     4     4     5     5     5     5     5     5     4
##  2 M011…     5     4     4     5     2     5     5     4     5     2     3     5
##  3 M007…     4     4     4     4     4     4     5     4     4     5     4     5
##  4 M012…     4     2     2     4     5     3     4     3     4     3     3     5
##  5 M012…     3     3     2     3     3     3     3     3     1     3     3     3
##  6 M013…     4     5     5     4     3     5     4     3     3     4     3     5
##  7 M012…     5     5     5     5     4     5     5     5     5     5     5     5
##  8 M016…     5     4     4     2     2     5     3     3     2     2     3     5
##  9 M015…     4     4     3     5     3     3     4     5     4     3     3     4
## 10 M013…     4     4     4    NA     3     2     4     3     4     3     4     4
## # ℹ 171 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "6-12 months" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.8166667
## 2            ali_2          0.7777778
## 3            ali_3          0.7111111
## 4            col_1          0.7262570
## 5            col_2          0.5674157
## 6            col_3          0.7734807
## 7            eng_1          0.8950276
## 8            eng_2          0.7734807
## 9            eng_3          0.7932961
## 10           eng_4          0.7071823
## 11           eng_5          0.6944444
## 12           inc_1          0.8618785
## 13           inc_2          0.6944444
## 14           inc_3          0.8100559
## 15           inc_4          0.6815642
## 16           inc_5          0.7888889
## 17           lea_1          0.7944444
## 18           lea_2          0.8333333
## 19           lea_3          0.8674033
## 20           lea_4          0.8287293
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 82     Alignment
## 2            ali_2                 78     Alignment
## 3            ali_3                 71     Alignment
## 4            col_1                 73 Collaboration
## 5            col_2                 57 Collaboration
## 6            col_3                 77 Collaboration
## 7            eng_1                 90    Engagement
## 8            eng_2                 77    Engagement
## 9            eng_3                 79    Engagement
## 10           eng_4                 71    Engagement
## 11           eng_5                 69    Engagement
## 12           inc_1                 86     Inclusion
## 13           inc_2                 69     Inclusion
## 14           inc_3                 81     Inclusion
## 15           inc_4                 68     Inclusion
## 16           inc_5                 79     Inclusion
## 17           lea_1                 79    Leadership
## 18           lea_2                 83    Leadership
## 19           lea_3                 87    Leadership
## 20           lea_4                 83    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.7685185
## 2                 col              0.6890511
## 3                 eng              0.7726862
## 4                 inc              0.7673664
## 5                 lea              0.8309776
#change the factor favorable scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 77     Alignment
## 1            ali_1                 82     Alignment
## 2            ali_2                 78     Alignment
## 3            ali_3                 71     Alignment
## 22             col                 69 Collaboration
## 4            col_1                 73 Collaboration
## 5            col_2                 57 Collaboration
## 6            col_3                 77 Collaboration
## 23             eng                 77    Engagement
## 7            eng_1                 90    Engagement
## 8            eng_2                 77    Engagement
## 9            eng_3                 79    Engagement
## 10           eng_4                 71    Engagement
## 11           eng_5                 69    Engagement
## 24             inc                 77     Inclusion
## 12           inc_1                 86     Inclusion
## 13           inc_2                 69     Inclusion
## 14           inc_3                 81     Inclusion
## 15           inc_4                 68     Inclusion
## 16           inc_5                 79     Inclusion
## 25             lea                 83    Leadership
## 17           lea_1                 79    Leadership
## 18           lea_2                 83    Leadership
## 19           lea_3                 87    Leadership
## 20           lea_4                 83    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 6-12 months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 6-12 months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "6-12 months",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
##   question_number favorability_score
## 1     6-12 months          0.7726862
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
##    question_number favorability_score     Factor
## 23             eng                 77 Engagement
## 7            eng_1                 90 Engagement
## 8            eng_2                 77 Engagement
## 9            eng_3                 79 Engagement
## 10           eng_4                 71 Engagement
## 11           eng_5                 69 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women with Tenure between 6 and 12 Months") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

    1. favorable scores for all questions for women with tenure between 1-2 years N = 211
table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "1-2 years" & 
  !is.na(df_2$gender), ]
## # A tibble: 211 × 29
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M017…     4     4     3     4     4     4     4     4     3     4     4     4
##  2 M002…     4     4     5     4     3     5     5     5     4     3     4     5
##  3 M007…     4     1     1     3     2     5     4     2     3     2     2     2
##  4 M016…     5     5     4     5     5     5     5     5     5     3     3     5
##  5 M012…     4     4     3     3     3     3     3     4     4     3     4     3
##  6 M009…     4     3     3     2     3     4     4     3     3     3     3     4
##  7 M009…     3     3     4     2     3     4     4     4     3     2     3     4
##  8 M024…     5     5     5     5     5     5     5     5     5     5     5     5
##  9 M022…     5     5     5     5     5     5     5     5     5     5     5     5
## 10 M019…     4     5     3     3     3     5     5     5     5     3     4     3
## # ℹ 201 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "1-2 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.8666667
## 2            ali_2          0.7809524
## 3            ali_3          0.6507177
## 4            col_1          0.7772512
## 5            col_2          0.6201923
## 6            col_3          0.8373206
## 7            eng_1          0.8904762
## 8            eng_2          0.8293839
## 9            eng_3          0.7320574
## 10           eng_4          0.5971564
## 11           eng_5          0.6952381
## 12           inc_1          0.9004739
## 13           inc_2          0.6666667
## 14           inc_3          0.7464115
## 15           inc_4          0.6411483
## 16           inc_5          0.8038278
## 17           lea_1          0.8173077
## 18           lea_2          0.8104265
## 19           lea_3          0.9238095
## 20           lea_4          0.8619048
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 87     Alignment
## 2            ali_2                 78     Alignment
## 3            ali_3                 65     Alignment
## 4            col_1                 78 Collaboration
## 5            col_2                 62 Collaboration
## 6            col_3                 84 Collaboration
## 7            eng_1                 89    Engagement
## 8            eng_2                 83    Engagement
## 9            eng_3                 73    Engagement
## 10           eng_4                 60    Engagement
## 11           eng_5                 70    Engagement
## 12           inc_1                 90     Inclusion
## 13           inc_2                 67     Inclusion
## 14           inc_3                 75     Inclusion
## 15           inc_4                 64     Inclusion
## 16           inc_5                 80     Inclusion
## 17           lea_1                 82    Leadership
## 18           lea_2                 81    Leadership
## 19           lea_3                 92    Leadership
## 20           lea_4                 86    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.7661123
## 2                 col              0.7449214
## 3                 eng              0.7488624
## 4                 inc              0.7517056
## 5                 lea              0.8533621
#change the factor favorable scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 77     Alignment
## 1            ali_1                 87     Alignment
## 2            ali_2                 78     Alignment
## 3            ali_3                 65     Alignment
## 22             col                 74 Collaboration
## 4            col_1                 78 Collaboration
## 5            col_2                 62 Collaboration
## 6            col_3                 84 Collaboration
## 23             eng                 75    Engagement
## 7            eng_1                 89    Engagement
## 8            eng_2                 83    Engagement
## 9            eng_3                 73    Engagement
## 10           eng_4                 60    Engagement
## 11           eng_5                 70    Engagement
## 24             inc                 75     Inclusion
## 12           inc_1                 90     Inclusion
## 13           inc_2                 67     Inclusion
## 14           inc_3                 75     Inclusion
## 15           inc_4                 64     Inclusion
## 16           inc_5                 80     Inclusion
## 25             lea                 85    Leadership
## 17           lea_1                 82    Leadership
## 18           lea_2                 81    Leadership
## 19           lea_3                 92    Leadership
## 20           lea_4                 86    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 1-2 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 1-2 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "1-2 years",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
##   question_number favorability_score
## 1       1-2 years          0.7488624
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
##    question_number favorability_score     Factor
## 23             eng                 75 Engagement
## 7            eng_1                 89 Engagement
## 8            eng_2                 83 Engagement
## 9            eng_3                 73 Engagement
## 10           eng_4                 60 Engagement
## 11           eng_5                 70 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women with Tenure between 1-2 Years") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

    1. favorable scores for all questions for women with tenure between 2-4 years N = 191
table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "2-4 years" & 
  !is.na(df_2$gender), ]
## # A tibble: 191 × 29
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M010…     5     5     5     5     4     5     5     5     4     5     3     5
##  2 M008…     5     5     5     5     5     5     5     5     5     5     5     5
##  3 M024…     5     5     4     3     5     5     5     5     3     2     3     5
##  4 M021…     3     5     4     2     5     5     3     2     3     2     3     3
##  5 M022…     4     2     4     4     2     1     3     3     5     3     3     4
##  6 M020…     4     4     4    NA    NA     4     3     3     2    NA     4     2
##  7 M013…     5     5     5     5     5     5     5     2     3     5     5     5
##  8 M024…     5     5     3     4     3     3     5     5     5     5     5     3
##  9 M001…     4     4     2     4     3     4     4     4     4     3     3     1
## 10 M009…     4     3     4     4     3     4     4     4     3     2     2     3
## # ℹ 181 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "2-4 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.8473684
## 2            ali_2          0.7765957
## 3            ali_3          0.6137566
## 4            col_1          0.7421053
## 5            col_2          0.6170213
## 6            col_3          0.8315789
## 7            eng_1          0.8900524
## 8            eng_2          0.7748691
## 9            eng_3          0.6910995
## 10           eng_4          0.5238095
## 11           eng_5          0.6000000
## 12           inc_1          0.8167539
## 13           inc_2          0.5828877
## 14           inc_3          0.6256684
## 15           inc_4          0.6276596
## 16           inc_5          0.7393617
## 17           lea_1          0.7801047
## 18           lea_2          0.7315789
## 19           lea_3          0.8691099
## 20           lea_4          0.8115183
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 85     Alignment
## 2            ali_2                 78     Alignment
## 3            ali_3                 61     Alignment
## 4            col_1                 74 Collaboration
## 5            col_2                 62 Collaboration
## 6            col_3                 83 Collaboration
## 7            eng_1                 89    Engagement
## 8            eng_2                 77    Engagement
## 9            eng_3                 69    Engagement
## 10           eng_4                 52    Engagement
## 11           eng_5                 60    Engagement
## 12           inc_1                 82     Inclusion
## 13           inc_2                 58     Inclusion
## 14           inc_3                 63     Inclusion
## 15           inc_4                 63     Inclusion
## 16           inc_5                 74     Inclusion
## 17           lea_1                 78    Leadership
## 18           lea_2                 73    Leadership
## 19           lea_3                 87    Leadership
## 20           lea_4                 81    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.7459069
## 2                 col              0.7302352
## 3                 eng              0.6959661
## 4                 inc              0.6784663
## 5                 lea              0.7980780
#change the favorability scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 75     Alignment
## 1            ali_1                 85     Alignment
## 2            ali_2                 78     Alignment
## 3            ali_3                 61     Alignment
## 22             col                 73 Collaboration
## 4            col_1                 74 Collaboration
## 5            col_2                 62 Collaboration
## 6            col_3                 83 Collaboration
## 23             eng                 70    Engagement
## 7            eng_1                 89    Engagement
## 8            eng_2                 77    Engagement
## 9            eng_3                 69    Engagement
## 10           eng_4                 52    Engagement
## 11           eng_5                 60    Engagement
## 24             inc                 68     Inclusion
## 12           inc_1                 82     Inclusion
## 13           inc_2                 58     Inclusion
## 14           inc_3                 63     Inclusion
## 15           inc_4                 63     Inclusion
## 16           inc_5                 74     Inclusion
## 25             lea                 80    Leadership
## 17           lea_1                 78    Leadership
## 18           lea_2                 73    Leadership
## 19           lea_3                 87    Leadership
## 20           lea_4                 81    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 2-4 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 2-4 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "2-4 years",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
##   question_number favorability_score
## 1       2-4 years          0.6959661
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
##    question_number favorability_score     Factor
## 23             eng                 70 Engagement
## 7            eng_1                 89 Engagement
## 8            eng_2                 77 Engagement
## 9            eng_3                 69 Engagement
## 10           eng_4                 52 Engagement
## 11           eng_5                 60 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women with Tenure between 2-4 Years") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

    1. favorable scores for all questions for women with tenure between 4-6 years N = 58
table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "4-6 years" & 
  !is.na(df_2$gender), ]
## # A tibble: 58 × 29
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M009…     5     5     4     5     5     5     5     5     5     5     5     5
##  2 M007…     4     4     3     4     5     5     4     4     4     3     3     5
##  3 M022…     2     5     4     3     5     5     4     4     1     4     5     3
##  4 M001…     4     4     2     3     3     5     4     4     3     2     3     5
##  5 M017…     4     5     4     3     4     5     5     5     5     5     5     4
##  6 M002…     4     4     4     3     3    NA     5     4     4     3     3     5
##  7 M018…     5     5     5     5     5     5     5     5     5     5     5     5
##  8 M015…     5     5     5     5     5     5     5     5     5     5     5     5
##  9 M007…     4     4     2     4     4     4     4     4     4     4     3     4
## 10 M018…     4     3     3     3     3     3     4     4     4     4     4     4
## # ℹ 48 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "4-6 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.8448276
## 2            ali_2          0.7894737
## 3            ali_3          0.5357143
## 4            col_1          0.6315789
## 5            col_2          0.6140351
## 6            col_3          0.7142857
## 7            eng_1          0.8620690
## 8            eng_2          0.7931034
## 9            eng_3          0.7368421
## 10           eng_4          0.4655172
## 11           eng_5          0.5000000
## 12           inc_1          0.7931034
## 13           inc_2          0.5964912
## 14           inc_3          0.6607143
## 15           inc_4          0.5818182
## 16           inc_5          0.6428571
## 17           lea_1          0.7321429
## 18           lea_2          0.6607143
## 19           lea_3          0.9107143
## 20           lea_4          0.8181818
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 84     Alignment
## 2            ali_2                 79     Alignment
## 3            ali_3                 54     Alignment
## 4            col_1                 63 Collaboration
## 5            col_2                 61 Collaboration
## 6            col_3                 71 Collaboration
## 7            eng_1                 86    Engagement
## 8            eng_2                 79    Engagement
## 9            eng_3                 74    Engagement
## 10           eng_4                 47    Engagement
## 11           eng_5                 50    Engagement
## 12           inc_1                 79     Inclusion
## 13           inc_2                 60     Inclusion
## 14           inc_3                 66     Inclusion
## 15           inc_4                 58     Inclusion
## 16           inc_5                 64     Inclusion
## 17           lea_1                 73    Leadership
## 18           lea_2                 66    Leadership
## 19           lea_3                 91    Leadership
## 20           lea_4                 82    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.7233385
## 2                 col              0.6532999
## 3                 eng              0.6715064
## 4                 inc              0.6549969
## 5                 lea              0.7804383
#change the favorability scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100


library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 72     Alignment
## 1            ali_1                 84     Alignment
## 2            ali_2                 79     Alignment
## 3            ali_3                 54     Alignment
## 22             col                 65 Collaboration
## 4            col_1                 63 Collaboration
## 5            col_2                 61 Collaboration
## 6            col_3                 71 Collaboration
## 23             eng                 67    Engagement
## 7            eng_1                 86    Engagement
## 8            eng_2                 79    Engagement
## 9            eng_3                 74    Engagement
## 10           eng_4                 47    Engagement
## 11           eng_5                 50    Engagement
## 24             inc                 65     Inclusion
## 12           inc_1                 79     Inclusion
## 13           inc_2                 60     Inclusion
## 14           inc_3                 66     Inclusion
## 15           inc_4                 58     Inclusion
## 16           inc_5                 64     Inclusion
## 25             lea                 78    Leadership
## 17           lea_1                 73    Leadership
## 18           lea_2                 66    Leadership
## 19           lea_3                 91    Leadership
## 20           lea_4                 82    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 4-6 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 4-6 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "4-6 years",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
##   question_number favorability_score
## 1       4-6 years          0.6715064
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
##    question_number favorability_score     Factor
## 23             eng                 67 Engagement
## 7            eng_1                 86 Engagement
## 8            eng_2                 79 Engagement
## 9            eng_3                 74 Engagement
## 10           eng_4                 47 Engagement
## 11           eng_5                 50 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women with Tenure between 4-6 Years") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

    1. favorable scores for all questions for women with tenure between 6-10 years N = 37
table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "6-10 years" & 
  !is.na(df_2$gender), ]
## # A tibble: 37 × 29
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M016…     5     5     5     5     4     5     5     5     4     3     4     5
##  2 M013…     4     3     3     3     3     3     4     3     3     2     2     4
##  3 M024…     5     5     3     5     5     5     4     4     4     3     4     4
##  4 M006…     5     5     5     5     3     4     5     5     5     5     4     5
##  5 M016…     5     5     3     4     4     4     5     5     4     4     4     5
##  6 M011…     5     5     5     4     5     5     5     4     4     5     5     5
##  7 M004…     5     5     4     4     3     4     5     5     5     4     4     5
##  8 M014…    NA    NA    NA     5    NA     2     4     4     4     3     4     5
##  9 M021…     4     4     3     4     3     4     4     4     3     3     4     4
## 10 M010…     1     1     1     1     1     3     2     2     1     1     1     1
## # ℹ 27 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Female" & 
  df_2$tenure_group == "6-10 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.7777778
## 2            ali_2          0.8055556
## 3            ali_3          0.4722222
## 4            col_1          0.7567568
## 5            col_2          0.5714286
## 6            col_3          0.7837838
## 7            eng_1          0.8648649
## 8            eng_2          0.7567568
## 9            eng_3          0.6216216
## 10           eng_4          0.5555556
## 11           eng_5          0.6666667
## 12           inc_1          0.9189189
## 13           inc_2          0.6111111
## 14           inc_3          0.7714286
## 15           inc_4          0.4285714
## 16           inc_5          0.6764706
## 17           lea_1          0.7567568
## 18           lea_2          0.7777778
## 19           lea_3          0.7837838
## 20           lea_4          0.7500000
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 78     Alignment
## 2            ali_2                 81     Alignment
## 3            ali_3                 47     Alignment
## 4            col_1                 76 Collaboration
## 5            col_2                 57 Collaboration
## 6            col_3                 78 Collaboration
## 7            eng_1                 86    Engagement
## 8            eng_2                 76    Engagement
## 9            eng_3                 62    Engagement
## 10           eng_4                 56    Engagement
## 11           eng_5                 67    Engagement
## 12           inc_1                 92     Inclusion
## 13           inc_2                 61     Inclusion
## 14           inc_3                 77     Inclusion
## 15           inc_4                 43     Inclusion
## 16           inc_5                 68     Inclusion
## 17           lea_1                 76    Leadership
## 18           lea_2                 78    Leadership
## 19           lea_3                 78    Leadership
## 20           lea_4                 75    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.6851852
## 2                 col              0.7039897
## 3                 eng              0.6930931
## 4                 inc              0.6813001
## 5                 lea              0.7670796
#change the factor favorable scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 69     Alignment
## 1            ali_1                 78     Alignment
## 2            ali_2                 81     Alignment
## 3            ali_3                 47     Alignment
## 22             col                 70 Collaboration
## 4            col_1                 76 Collaboration
## 5            col_2                 57 Collaboration
## 6            col_3                 78 Collaboration
## 23             eng                 69    Engagement
## 7            eng_1                 86    Engagement
## 8            eng_2                 76    Engagement
## 9            eng_3                 62    Engagement
## 10           eng_4                 56    Engagement
## 11           eng_5                 67    Engagement
## 24             inc                 68     Inclusion
## 12           inc_1                 92     Inclusion
## 13           inc_2                 61     Inclusion
## 14           inc_3                 77     Inclusion
## 15           inc_4                 43     Inclusion
## 16           inc_5                 68     Inclusion
## 25             lea                 77    Leadership
## 17           lea_1                 76    Leadership
## 18           lea_2                 78    Leadership
## 19           lea_3                 78    Leadership
## 20           lea_4                 75    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 6-10 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Women with Tenure between 6-10 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "6-10 years",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
##   question_number favorability_score
## 1      6-10 years          0.6930931
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
##    question_number favorability_score     Factor
## 23             eng                 69 Engagement
## 7            eng_1                 86 Engagement
## 8            eng_2                 76 Engagement
## 9            eng_3                 62 Engagement
## 10           eng_4                 56 Engagement
## 11           eng_5                 67 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Women with Tenure between 6-10 Years") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

    1. favorable scores for all questions for Men for Under 3 months N = 39
df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "Under 3 months" & 
  !is.na(df_2$gender), ]
## # A tibble: 39 × 29
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M014…     5     5     4     5     3     1     5     5     5     5     5     5
##  2 M004…     3     4     5     5     5     5     4     4     5     5     4     5
##  3 M005…     4     4     4     4     4     4     5     5     5     4     5     4
##  4 M021…     4     4     4     4     3     3     5     5     4     5     5     4
##  5 M020…     5     5     5     5     5     5     5     4     5     5     5     5
##  6 M002…     4     4     4     4     4     4     4     4     4     4     4     5
##  7 M024…     4     4     3     4     3     3     5     1     5     3     4     4
##  8 M008…     4     4     4     4     3     4     4     5     3     4     4     4
##  9 M012…     4     4     4     4     3     4     5     4     5     4     4     4
## 10 M024…     5     5     5     5     5     5     5     5     4     5     5     5
## # ℹ 29 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "Under 3 months" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.8461538
## 2            ali_2          0.9487179
## 3            ali_3          0.7948718
## 4            col_1          0.8717949
## 5            col_2          0.5142857
## 6            col_3          0.8717949
## 7            eng_1          1.0000000
## 8            eng_2          0.8974359
## 9            eng_3          0.8974359
## 10           eng_4          0.8461538
## 11           eng_5          0.8461538
## 12           inc_1          0.9487179
## 13           inc_2          0.7222222
## 14           inc_3          0.9210526
## 15           inc_4          0.7777778
## 16           inc_5          0.8974359
## 17           lea_1          0.7948718
## 18           lea_2          0.8974359
## 19           lea_3          0.9230769
## 20           lea_4          0.8205128
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 85     Alignment
## 2            ali_2                 95     Alignment
## 3            ali_3                 79     Alignment
## 4            col_1                 87 Collaboration
## 5            col_2                 51 Collaboration
## 6            col_3                 87 Collaboration
## 7            eng_1                100    Engagement
## 8            eng_2                 90    Engagement
## 9            eng_3                 90    Engagement
## 10           eng_4                 85    Engagement
## 11           eng_5                 85    Engagement
## 12           inc_1                 95     Inclusion
## 13           inc_2                 72     Inclusion
## 14           inc_3                 92     Inclusion
## 15           inc_4                 78     Inclusion
## 16           inc_5                 90     Inclusion
## 17           lea_1                 79    Leadership
## 18           lea_2                 90    Leadership
## 19           lea_3                 92    Leadership
## 20           lea_4                 82    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.8632479
## 2                 col              0.7526252
## 3                 eng              0.8974359
## 4                 inc              0.8534413
## 5                 lea              0.8589744
#change the factor favorable scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")

#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 86     Alignment
## 1            ali_1                 85     Alignment
## 2            ali_2                 95     Alignment
## 3            ali_3                 79     Alignment
## 22             col                 75 Collaboration
## 4            col_1                 87 Collaboration
## 5            col_2                 51 Collaboration
## 6            col_3                 87 Collaboration
## 23             eng                 90    Engagement
## 7            eng_1                100    Engagement
## 8            eng_2                 90    Engagement
## 9            eng_3                 90    Engagement
## 10           eng_4                 85    Engagement
## 11           eng_5                 85    Engagement
## 24             inc                 85     Inclusion
## 12           inc_1                 95     Inclusion
## 13           inc_2                 72     Inclusion
## 14           inc_3                 92     Inclusion
## 15           inc_4                 78     Inclusion
## 16           inc_5                 90     Inclusion
## 25             lea                 86    Leadership
## 17           lea_1                 79    Leadership
## 18           lea_2                 90    Leadership
## 19           lea_3                 92    Leadership
## 20           lea_4                 82    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure under 3 Months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure under 3 Months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "Under 3 months",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
##   question_number favorability_score
## 1  Under 3 months          0.8974359
#create a graph that shows engagement favorable score across tenure groups 

engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
##    question_number favorability_score     Factor
## 23             eng                 90 Engagement
## 7            eng_1                100 Engagement
## 8            eng_2                 90 Engagement
## 9            eng_3                 90 Engagement
## 10           eng_4                 85 Engagement
## 11           eng_5                 85 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men with Tenure Under 3 Months") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

- j. favorable scores for all questions for Men with tenure between 3-6 months N = 116

table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "3-6 months" & 
  !is.na(df_2$gender), ]
## # A tibble: 209 × 29
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M003…     5     5     4     5     4     5     5     5     5     4     3     5
##  2 M002…     5     5     5     5     5     5     5     5     5     4     4     5
##  3 M021…     5     4     4     4     3     5     5     3     4     2     4     5
##  4 M024…     5     4     2     4     3     3     4     5     4     3     3     4
##  5 M014…     4     4     4     4     4     4     4     2     3     2     4     3
##  6 M017…     5     5     5     5     5     5     5     5     5     5     5     5
##  7 M020…     4     4     3     4     4     4     5     5     4     4     5     5
##  8 M010…     5     5     4     5    NA    NA     5     5     5     5     5     5
##  9 M023…     5     5     4     5     5     5     5     5     5     5     5     5
## 10 M008…     5     5     5     5     5     5     5     5     5     5     5     5
## # ℹ 199 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "3-6 months" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.9320388
## 2            ali_2          0.8888889
## 3            ali_3          0.7584541
## 4            col_1          0.9033816
## 5            col_2          0.5322581
## 6            col_3          0.8689320
## 7            eng_1          0.9567308
## 8            eng_2          0.8990385
## 9            eng_3          0.8990385
## 10           eng_4          0.8365385
## 11           eng_5          0.8653846
## 12           inc_1          0.9268293
## 13           inc_2          0.7587940
## 14           inc_3          0.8712871
## 15           inc_4          0.7450000
## 16           inc_5          0.7980296
## 17           lea_1          0.8743961
## 18           lea_2          0.9077670
## 19           lea_3          0.9275362
## 20           lea_4          0.9275362
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 93     Alignment
## 2            ali_2                 89     Alignment
## 3            ali_3                 76     Alignment
## 4            col_1                 90 Collaboration
## 5            col_2                 53 Collaboration
## 6            col_3                 87 Collaboration
## 7            eng_1                 96    Engagement
## 8            eng_2                 90    Engagement
## 9            eng_3                 90    Engagement
## 10           eng_4                 84    Engagement
## 11           eng_5                 87    Engagement
## 12           inc_1                 93     Inclusion
## 13           inc_2                 76     Inclusion
## 14           inc_3                 87     Inclusion
## 15           inc_4                 74     Inclusion
## 16           inc_5                 80     Inclusion
## 17           lea_1                 87    Leadership
## 18           lea_2                 91    Leadership
## 19           lea_3                 93    Leadership
## 20           lea_4                 93    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.8597939
## 2                 col              0.7681906
## 3                 eng              0.8913462
## 4                 inc              0.8199880
## 5                 lea              0.9093089
#change the favorability scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100


library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 86     Alignment
## 1            ali_1                 93     Alignment
## 2            ali_2                 89     Alignment
## 3            ali_3                 76     Alignment
## 22             col                 77 Collaboration
## 4            col_1                 90 Collaboration
## 5            col_2                 53 Collaboration
## 6            col_3                 87 Collaboration
## 23             eng                 89    Engagement
## 7            eng_1                 96    Engagement
## 8            eng_2                 90    Engagement
## 9            eng_3                 90    Engagement
## 10           eng_4                 84    Engagement
## 11           eng_5                 87    Engagement
## 24             inc                 82     Inclusion
## 12           inc_1                 93     Inclusion
## 13           inc_2                 76     Inclusion
## 14           inc_3                 87     Inclusion
## 15           inc_4                 74     Inclusion
## 16           inc_5                 80     Inclusion
## 25             lea                 91    Leadership
## 17           lea_1                 87    Leadership
## 18           lea_2                 91    Leadership
## 19           lea_3                 93    Leadership
## 20           lea_4                 93    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 3-6 months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 3-6 months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "3-6 months",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
##   question_number favorability_score
## 1      3-6 months          0.8913462
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
##    question_number favorability_score     Factor
## 23             eng                 89 Engagement
## 7            eng_1                 96 Engagement
## 8            eng_2                 90 Engagement
## 9            eng_3                 90 Engagement
## 10           eng_4                 84 Engagement
## 11           eng_5                 87 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men with Tenure between 3-6 Months") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

    1. favorable scores for all questions for Men with tenure between 6-12 months N = 181
table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "6-12 months" & 
  !is.na(df_2$gender), ]
## # A tibble: 325 × 29
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M012…     4     3     4     3     3     3     4     3     4     3     3     4
##  2 M025…     5     5     4     4     5     5     5     4     5     4     5     5
##  3 M007…     4     3     3     5     3     4     4     4     3     3     3     4
##  4 M020…     5     5     5     5     5     5     5     5     4     5     5     5
##  5 M024…     2     4     3     3     4     4     3     1     2     3     2     3
##  6 M019…     5     5     5     5     5     5     5     5     5     5     5     5
##  7 M007…     5     4     3     4     4     3     5     5     5     5     5     3
##  8 M013…     5     5     5     5     5     5     5     5     5     5     5     5
##  9 M023…     5     5     5     5     5     5     4     5     5     4     5     4
## 10 M001…     4     4     4     2     3     3     4     3     5     4     4     4
## # ℹ 315 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "6-12 months" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.8981481
## 2            ali_2          0.8452012
## 3            ali_3          0.6677019
## 4            col_1          0.8664596
## 5            col_2          0.5460317
## 6            col_3          0.8204334
## 7            eng_1          0.9166667
## 8            eng_2          0.8307692
## 9            eng_3          0.8240741
## 10           eng_4          0.7292308
## 11           eng_5          0.7507692
## 12           inc_1          0.9068323
## 13           inc_2          0.7746032
## 14           inc_3          0.8593750
## 15           inc_4          0.6813880
## 16           inc_5          0.7875000
## 17           lea_1          0.8456790
## 18           lea_2          0.8641975
## 19           lea_3          0.9138462
## 20           lea_4          0.8947368
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 90     Alignment
## 2            ali_2                 85     Alignment
## 3            ali_3                 67     Alignment
## 4            col_1                 87 Collaboration
## 5            col_2                 55 Collaboration
## 6            col_3                 82 Collaboration
## 7            eng_1                 92    Engagement
## 8            eng_2                 83    Engagement
## 9            eng_3                 82    Engagement
## 10           eng_4                 73    Engagement
## 11           eng_5                 75    Engagement
## 12           inc_1                 91     Inclusion
## 13           inc_2                 77     Inclusion
## 14           inc_3                 86     Inclusion
## 15           inc_4                 68     Inclusion
## 16           inc_5                 79     Inclusion
## 17           lea_1                 85    Leadership
## 18           lea_2                 86    Leadership
## 19           lea_3                 91    Leadership
## 20           lea_4                 89    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.8036837
## 2                 col              0.7443083
## 3                 eng              0.8103020
## 4                 inc              0.8019397
## 5                 lea              0.8796149
#change the factor favorable scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 80     Alignment
## 1            ali_1                 90     Alignment
## 2            ali_2                 85     Alignment
## 3            ali_3                 67     Alignment
## 22             col                 74 Collaboration
## 4            col_1                 87 Collaboration
## 5            col_2                 55 Collaboration
## 6            col_3                 82 Collaboration
## 23             eng                 81    Engagement
## 7            eng_1                 92    Engagement
## 8            eng_2                 83    Engagement
## 9            eng_3                 82    Engagement
## 10           eng_4                 73    Engagement
## 11           eng_5                 75    Engagement
## 24             inc                 80     Inclusion
## 12           inc_1                 91     Inclusion
## 13           inc_2                 77     Inclusion
## 14           inc_3                 86     Inclusion
## 15           inc_4                 68     Inclusion
## 16           inc_5                 79     Inclusion
## 25             lea                 88    Leadership
## 17           lea_1                 85    Leadership
## 18           lea_2                 86    Leadership
## 19           lea_3                 91    Leadership
## 20           lea_4                 89    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 6-12 months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 6-12 months") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "6-12 months",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
##   question_number favorability_score
## 1     6-12 months           0.810302
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
##    question_number favorability_score     Factor
## 23             eng                 81 Engagement
## 7            eng_1                 92 Engagement
## 8            eng_2                 83 Engagement
## 9            eng_3                 82 Engagement
## 10           eng_4                 73 Engagement
## 11           eng_5                 75 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men with Tenure between 6 and 12 Months") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

    1. favorable scores for all questions for Men with tenure between 1-2 years N =
table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "1-2 years" & 
  !is.na(df_2$gender), ]
## # A tibble: 459 × 29
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M006…     4     4     4     4     4     4     4     4     4     3     3     4
##  2 M014…     5     5     4     5     5     5     5     5     4     3     3     5
##  3 M013…     4     5     4     5     4     5     5     5     5     4     5     5
##  4 M000…     5     4     4     3     5     3     5     5     5     5     5     5
##  5 M020…     5     4     3     5     3     4     5     5     3     4     4     5
##  6 M016…     3     3     2     4     3     4     4     4     4     3     3     3
##  7 M017…     4     4     3     4     4     4     4     4     5     4     4     4
##  8 M012…     5     5     4     5     4     4     4     4     4     5     5     4
##  9 M015…     3     4     4     4     3     5     4     4     4     4     3     4
## 10 M022…     5     5     5     5     4     3     5     5     5     5     5     5
## # ℹ 449 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "1-2 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.9102845
## 2            ali_2          0.8231441
## 3            ali_3          0.6710240
## 4            col_1          0.8518519
## 5            col_2          0.6228070
## 6            col_3          0.8358862
## 7            eng_1          0.9387309
## 8            eng_2          0.8711790
## 9            eng_3          0.8275109
## 10           eng_4          0.6914661
## 11           eng_5          0.7227074
## 12           inc_1          0.9080963
## 13           inc_2          0.7550562
## 14           inc_3          0.8505495
## 15           inc_4          0.7676991
## 16           inc_5          0.7802198
## 17           lea_1          0.8769231
## 18           lea_2          0.8796499
## 19           lea_3          0.8903509
## 20           lea_4          0.9082969
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 91     Alignment
## 2            ali_2                 82     Alignment
## 3            ali_3                 67     Alignment
## 4            col_1                 85 Collaboration
## 5            col_2                 62 Collaboration
## 6            col_3                 84 Collaboration
## 7            eng_1                 94    Engagement
## 8            eng_2                 87    Engagement
## 9            eng_3                 83    Engagement
## 10           eng_4                 69    Engagement
## 11           eng_5                 72    Engagement
## 12           inc_1                 91     Inclusion
## 13           inc_2                 76     Inclusion
## 14           inc_3                 85     Inclusion
## 15           inc_4                 77     Inclusion
## 16           inc_5                 78     Inclusion
## 17           lea_1                 88    Leadership
## 18           lea_2                 88    Leadership
## 19           lea_3                 89    Leadership
## 20           lea_4                 91    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.8014842
## 2                 col              0.7701817
## 3                 eng              0.8103189
## 4                 inc              0.8123242
## 5                 lea              0.8888052
#change the factor favorable scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 80     Alignment
## 1            ali_1                 91     Alignment
## 2            ali_2                 82     Alignment
## 3            ali_3                 67     Alignment
## 22             col                 77 Collaboration
## 4            col_1                 85 Collaboration
## 5            col_2                 62 Collaboration
## 6            col_3                 84 Collaboration
## 23             eng                 81    Engagement
## 7            eng_1                 94    Engagement
## 8            eng_2                 87    Engagement
## 9            eng_3                 83    Engagement
## 10           eng_4                 69    Engagement
## 11           eng_5                 72    Engagement
## 24             inc                 81     Inclusion
## 12           inc_1                 91     Inclusion
## 13           inc_2                 76     Inclusion
## 14           inc_3                 85     Inclusion
## 15           inc_4                 77     Inclusion
## 16           inc_5                 78     Inclusion
## 25             lea                 89    Leadership
## 17           lea_1                 88    Leadership
## 18           lea_2                 88    Leadership
## 19           lea_3                 89    Leadership
## 20           lea_4                 91    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 1-2 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 1-2 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "1-2 years",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
##   question_number favorability_score
## 1       1-2 years          0.8103189
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
##    question_number favorability_score     Factor
## 23             eng                 81 Engagement
## 7            eng_1                 94 Engagement
## 8            eng_2                 87 Engagement
## 9            eng_3                 83 Engagement
## 10           eng_4                 69 Engagement
## 11           eng_5                 72 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men with Tenure between 1-2 Years") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

    1. favorable scores for all questions for Men with tenure between 2-4 years N =
table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "2-4 years" & 
  !is.na(df_2$gender), ]
## # A tibble: 453 × 29
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M017…    NA    NA    NA    NA    NA    NA     4    NA    NA    NA    NA    NA
##  2 M004…     4     5     5     4     5     5     5     3     4     4     4     4
##  3 M014…     4     4     2     3     2     3     4     4     3     2     3     5
##  4 M009…     4     5     4     4     5     5     4     1     1     3     4     5
##  5 M020…     5     5     5     5     4     5     5     5     5     5     5     5
##  6 M001…     5     5     4     3     3     4     5     5     4     2     3     5
##  7 M014…     5     5     1     5     3     2     5     5     5     4     4     5
##  8 M011…     4     5     4     4     2     1     5     4     4     4     4     4
##  9 M012…     5     4     4     5     3     5     5     5     4     5     5     5
## 10 M013…     4     4     5     4     3     4     4     5     4     4     4     4
## # ℹ 443 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "2-4 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.8953229
## 2            ali_2          0.8459821
## 3            ali_3          0.6674107
## 4            col_1          0.8177778
## 5            col_2          0.5915179
## 6            col_3          0.7438753
## 7            eng_1          0.9247788
## 8            eng_2          0.8716814
## 9            eng_3          0.7977778
## 10           eng_4          0.6615044
## 11           eng_5          0.7323009
## 12           inc_1          0.9159292
## 13           inc_2          0.7652370
## 14           inc_3          0.8224719
## 15           inc_4          0.7042889
## 16           inc_5          0.7645740
## 17           lea_1          0.8466667
## 18           lea_2          0.8530067
## 19           lea_3          0.8824834
## 20           lea_4          0.8933333
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 90     Alignment
## 2            ali_2                 85     Alignment
## 3            ali_3                 67     Alignment
## 4            col_1                 82 Collaboration
## 5            col_2                 59 Collaboration
## 6            col_3                 74 Collaboration
## 7            eng_1                 92    Engagement
## 8            eng_2                 87    Engagement
## 9            eng_3                 80    Engagement
## 10           eng_4                 66    Engagement
## 11           eng_5                 73    Engagement
## 12           inc_1                 92     Inclusion
## 13           inc_2                 77     Inclusion
## 14           inc_3                 82     Inclusion
## 15           inc_4                 70     Inclusion
## 16           inc_5                 76     Inclusion
## 17           lea_1                 85    Leadership
## 18           lea_2                 85    Leadership
## 19           lea_3                 88    Leadership
## 20           lea_4                 89    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.8029053
## 2                 col              0.7177236
## 3                 eng              0.7976087
## 4                 inc              0.7945002
## 5                 lea              0.8688725
#change the favorability scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100


library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 80     Alignment
## 1            ali_1                 90     Alignment
## 2            ali_2                 85     Alignment
## 3            ali_3                 67     Alignment
## 22             col                 72 Collaboration
## 4            col_1                 82 Collaboration
## 5            col_2                 59 Collaboration
## 6            col_3                 74 Collaboration
## 23             eng                 80    Engagement
## 7            eng_1                 92    Engagement
## 8            eng_2                 87    Engagement
## 9            eng_3                 80    Engagement
## 10           eng_4                 66    Engagement
## 11           eng_5                 73    Engagement
## 24             inc                 79     Inclusion
## 12           inc_1                 92     Inclusion
## 13           inc_2                 77     Inclusion
## 14           inc_3                 82     Inclusion
## 15           inc_4                 70     Inclusion
## 16           inc_5                 76     Inclusion
## 25             lea                 87    Leadership
## 17           lea_1                 85    Leadership
## 18           lea_2                 85    Leadership
## 19           lea_3                 88    Leadership
## 20           lea_4                 89    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 2-4 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 2-4 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "2-4 years",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
##   question_number favorability_score
## 1       2-4 years          0.7976087
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
##    question_number favorability_score     Factor
## 23             eng                 80 Engagement
## 7            eng_1                 92 Engagement
## 8            eng_2                 87 Engagement
## 9            eng_3                 80 Engagement
## 10           eng_4                 66 Engagement
## 11           eng_5                 73 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men with Tenure between 2-4 Years") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

    1. favorable scores for all questions for Men with tenure between 4-6 years N =
table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "4-6 years" & 
  !is.na(df_2$gender), ]
## # A tibble: 216 × 29
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M008…     5     5     5     5     5     5     5     5     5     5     5     5
##  2 M001…     5     5     4     5     5     5     5     5     5     4     3     5
##  3 M002…     5     5     4     5     4     4     5     5     5     5     5     5
##  4 M018…     4     4     4     2     5     5     4     1     3     4     3     4
##  5 M013…     5     5     4     4    NA     5     5     5     5     4     5     5
##  6 M006…     5     5     2     5     3     4     5     5     5     4     2     4
##  7 M009…     5     5     3     5     5     5     5     5     5     5     5     5
##  8 M005…     4     4     4     4     3     3     4     4     4     4     4     4
##  9 M023…     5     5     5     4     5     5     5     4     5     3     3     5
## 10 M007…     5     5     3     5     4     5     5     5     5     5     5     5
## # ℹ 206 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "4-6 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.8930233
## 2            ali_2          0.8262911
## 3            ali_3          0.7289720
## 4            col_1          0.8465116
## 5            col_2          0.6525822
## 6            col_3          0.7428571
## 7            eng_1          0.9120370
## 8            eng_2          0.8697674
## 9            eng_3          0.8000000
## 10           eng_4          0.6744186
## 11           eng_5          0.6481481
## 12           inc_1          0.9052133
## 13           inc_2          0.7572816
## 14           inc_3          0.8564593
## 15           inc_4          0.7439614
## 16           inc_5          0.7403846
## 17           lea_1          0.7943925
## 18           lea_2          0.8651163
## 19           lea_3          0.8925234
## 20           lea_4          0.8738318
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 89     Alignment
## 2            ali_2                 83     Alignment
## 3            ali_3                 73     Alignment
## 4            col_1                 85 Collaboration
## 5            col_2                 65 Collaboration
## 6            col_3                 74 Collaboration
## 7            eng_1                 91    Engagement
## 8            eng_2                 87    Engagement
## 9            eng_3                 80    Engagement
## 10           eng_4                 67    Engagement
## 11           eng_5                 65    Engagement
## 12           inc_1                 91     Inclusion
## 13           inc_2                 76     Inclusion
## 14           inc_3                 86     Inclusion
## 15           inc_4                 74     Inclusion
## 16           inc_5                 74     Inclusion
## 17           lea_1                 79    Leadership
## 18           lea_2                 87    Leadership
## 19           lea_3                 89    Leadership
## 20           lea_4                 87    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.8160954
## 2                 col              0.7473170
## 3                 eng              0.7808742
## 4                 inc              0.8006600
## 5                 lea              0.8564660
#change the favorability scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 82     Alignment
## 1            ali_1                 89     Alignment
## 2            ali_2                 83     Alignment
## 3            ali_3                 73     Alignment
## 22             col                 75 Collaboration
## 4            col_1                 85 Collaboration
## 5            col_2                 65 Collaboration
## 6            col_3                 74 Collaboration
## 23             eng                 78    Engagement
## 7            eng_1                 91    Engagement
## 8            eng_2                 87    Engagement
## 9            eng_3                 80    Engagement
## 10           eng_4                 67    Engagement
## 11           eng_5                 65    Engagement
## 24             inc                 80     Inclusion
## 12           inc_1                 91     Inclusion
## 13           inc_2                 76     Inclusion
## 14           inc_3                 86     Inclusion
## 15           inc_4                 74     Inclusion
## 16           inc_5                 74     Inclusion
## 25             lea                 86    Leadership
## 17           lea_1                 79    Leadership
## 18           lea_2                 87    Leadership
## 19           lea_3                 89    Leadership
## 20           lea_4                 87    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 4-6 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 4-6 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "4-6 years",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
##   question_number favorability_score
## 1       4-6 years          0.7808742
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
##    question_number favorability_score     Factor
## 23             eng                 78 Engagement
## 7            eng_1                 91 Engagement
## 8            eng_2                 87 Engagement
## 9            eng_3                 80 Engagement
## 10           eng_4                 67 Engagement
## 11           eng_5                 65 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men with Tenure between 4-6 Years") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors)  

    1. favorable scores for all questions for Men with tenure between 6-10 years N =
table(df_2$gender, df_2$tenure_group)
##         
##          1-2 years 10+ years 2-4 years 3-6 months 4-6 years 6-10 years
##   Female       211         1       191        116        58         37
##   Male         459         3       453        209       216        126
##         
##          6-12 months Under 3 months
##   Female         181             25
##   Male           325             39
df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "6-10 years" & 
  !is.na(df_2$gender), ]
## # A tibble: 126 × 29
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M017…     4     4     4     4     4     3     4     4     4     4     4     4
##  2 M021…     5     5     4     5     4     5     5     5     4     5     5     5
##  3 M025…     4     4     4     4     4     4     5     5     3     3     4     4
##  4 M022…     3     3     4     5     3     3     4     3     3     2     2     4
##  5 M016…     5     5     4     5     5     5     5     5     5     5     5     5
##  6 M016…     4     4     3     4     3     4     4     4     3     3     3     4
##  7 M011…     4    NA    NA     4     4    NA     4     4     4     4     3     4
##  8 M011…     5     5     5     4     3     3     4     5     4     4     4     4
##  9 M023…     4     4     4     4     4     4     4     5     3     2     4     4
## 10 M014…    NA    NA    NA     2     2     1     4     4     3     2     3     2
## # ℹ 116 more rows
## # ℹ 16 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>
# Calculate favorability_score for columns 2 to 21 by using function_favorability I created.
favorability_score <- function_favorability(df_2[
  df_2$gender == "Male" & 
  df_2$tenure_group == "6-10 years" , 2:21])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##    question_number favorability_score
## 1            ali_1          0.9354839
## 2            ali_2          0.8943089
## 3            ali_3          0.7723577
## 4            col_1          0.8492063
## 5            col_2          0.7200000
## 6            col_3          0.7680000
## 7            eng_1          0.9365079
## 8            eng_2          0.8888889
## 9            eng_3          0.7698413
## 10           eng_4          0.6904762
## 11           eng_5          0.7200000
## 12           inc_1          0.8861789
## 13           inc_2          0.7166667
## 14           inc_3          0.7786885
## 15           inc_4          0.6290323
## 16           inc_5          0.7704918
## 17           lea_1          0.8080000
## 18           lea_2          0.8548387
## 19           lea_3          0.8480000
## 20           lea_4          0.8560000
favorability_score_percent <- favorability_score 
favorability_score_percent$favorability_score <- round(favorability_score_percent$favorability_score,2)*100
  


library(dplyr)
favorability_score_percent$Factor <- dplyr::case_when(
  favorability_score_percent$question_number %in% c("ali_1", "ali_2", "ali_3") ~ "Alignment",
  favorability_score_percent$question_number %in% c("col_1", "col_2", "col_3") ~ "Collaboration",
  favorability_score_percent$question_number %in% c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5") ~ "Engagement",
  favorability_score_percent$question_number %in% c("inc_1", "inc_2", "inc_3", "inc_4", "inc_5") ~ "Inclusion",
  favorability_score_percent$question_number %in% c("lea_1", "lea_2", "lea_3", "lea_4") ~ "Leadership",
  TRUE ~ "Other")



print(favorability_score_percent)
##    question_number favorability_score        Factor
## 1            ali_1                 94     Alignment
## 2            ali_2                 89     Alignment
## 3            ali_3                 77     Alignment
## 4            col_1                 85 Collaboration
## 5            col_2                 72 Collaboration
## 6            col_3                 77 Collaboration
## 7            eng_1                 94    Engagement
## 8            eng_2                 89    Engagement
## 9            eng_3                 77    Engagement
## 10           eng_4                 69    Engagement
## 11           eng_5                 72    Engagement
## 12           inc_1                 89     Inclusion
## 13           inc_2                 72     Inclusion
## 14           inc_3                 78     Inclusion
## 15           inc_4                 63     Inclusion
## 16           inc_5                 77     Inclusion
## 17           lea_1                 81    Leadership
## 18           lea_2                 85    Leadership
## 19           lea_3                 85    Leadership
## 20           lea_4                 86    Leadership
## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(factor_favorable_score,
                                  data.frame(factor_abbreviation = factor_abbr,
                                             factor_favorable_score = avg_favorable_score))
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score
## 1                 ali              0.8673835
## 2                 col              0.7790688
## 3                 eng              0.8011429
## 4                 inc              0.7562116
## 5                 lea              0.8417097
#change the favorability scores into percentage 
factor_favorability_score_percent <- factor_favorable_score
factor_favorability_score_percent$factor_favorable_score <- round(factor_favorability_score_percent$factor_favorable_score,2)*100

library(dplyr)
factor_favorability_score_percent$Factor <- dplyr::case_when(
  factor_favorability_score_percent$factor_abbreviation %in% c("ali") ~ "Alignment",
  factor_favorability_score_percent$factor_abbreviation %in% c("col") ~ "Collaboration",
  factor_favorability_score_percent$factor_abbreviation %in% c("eng") ~ "Engagement",
  factor_favorability_score_percent$factor_abbreviation %in% c("inc") ~ "Inclusion",
  factor_favorability_score_percent$factor_abbreviation %in% c("lea") ~ "Leadership",
  TRUE ~ "Other")



#combine the table with favorable score for each item with the table with favorable score for each factor
colnames(factor_favorability_score_percent) <- c("question_number", "favorability_score", "Factor")
combined_favorability_df <- rbind(favorability_score_percent, factor_favorability_score_percent)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Factor
## 21             ali                 87     Alignment
## 1            ali_1                 94     Alignment
## 2            ali_2                 89     Alignment
## 3            ali_3                 77     Alignment
## 22             col                 78 Collaboration
## 4            col_1                 85 Collaboration
## 5            col_2                 72 Collaboration
## 6            col_3                 77 Collaboration
## 23             eng                 80    Engagement
## 7            eng_1                 94    Engagement
## 8            eng_2                 89    Engagement
## 9            eng_3                 77    Engagement
## 10           eng_4                 69    Engagement
## 11           eng_5                 72    Engagement
## 24             inc                 76     Inclusion
## 12           inc_1                 89     Inclusion
## 13           inc_2                 72     Inclusion
## 14           inc_3                 78     Inclusion
## 15           inc_4                 63     Inclusion
## 16           inc_5                 77     Inclusion
## 25             lea                 84    Leadership
## 17           lea_1                 81    Leadership
## 18           lea_2                 85    Leadership
## 19           lea_3                 85    Leadership
## 20           lea_4                 86    Leadership
## Create a ggplot bar plot with the preselected colors
library(ggplot2)
ggplot(combined_favorability_df, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 6-10 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

## Create a ggplot bar plot for factor favorable scores

library(ggplot2)
ggplot(factor_favorability_score_percent, aes(x = question_number, y = favorability_score, fill = Factor)) +
  geom_bar(stat = "identity", show.legend = TRUE) + 
  labs(x = "Question Item", y = "Favorability Score", title = "Favorable Score for Each Factor and Question Item for Men with Tenure between 6-10 years") + 
  theme_minimal() + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  
    plot.title = element_text(hjust = 0.5)
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5) +  # Add text labels to the bars
  scale_fill_manual(values = group_colors)  # Apply custom colors to the bars

#create a df storing each group's engagement factor favorability score gradually.
engagement_by_tenure <- data.frame(
  question_number = "6-10 years",  
  favorability_score = factor_favorable_score$factor_favorable_score[factor_favorable_score$factor_abbreviation == "eng"])

engagement_by_tenure
##   question_number favorability_score
## 1      6-10 years          0.8011429
engagement_favorable_score_for_all <- combined_favorability_df[combined_favorability_df$question_number %in% c("eng", "eng_1", "eng_2", "eng_3", "eng_4", "eng_5"), ]


engagement_favorable_score_for_all
##    question_number favorability_score     Factor
## 23             eng                 80 Engagement
## 7            eng_1                 94 Engagement
## 8            eng_2                 89 Engagement
## 9            eng_3                 77 Engagement
## 10           eng_4                 69 Engagement
## 11           eng_5                 72 Engagement
# Distinguish the average row from other item rows
engagement_favorable_score_for_all$highlight <- ifelse(engagement_favorable_score_for_all$question_number == engagement_favorable_score_for_all$question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category

ggplot(engagement_favorable_score_for_all, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE) +  
  labs(x = "Factor Items", y = "Favorability Score", title = "Engagement Factor Favorable Scores for Men with Tenure between 6-10 Years") + 
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(favorability_score, "%")), vjust = -0.5, size = 4) +  
  scale_fill_manual(values = highlight_colors) 

    1. Correlate all non-engagement questions to the engagement factor for Hooli Overall.
      1. for all employees & all non-engagement items (Kendall’s tau-b) The results from the correlations calculated by conducting Kendall’s tau-b and pearson’s r are largley convergent while kendall’s tau-b’s results are much more conservative in terms of the size of the correlations.
#Create a new variable named 'engagement_factor', representing the average engagement scores.
df_2$engagement_factor <- rowMeans(df_2[, c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5")], na.rm = TRUE)

#Select only the numeric columns from 'df_2' excluding 'eng_1' to 'eng_5'
numeric_df_2 <- Filter(is.numeric, df_2)
library(dplyr)
numeric_df_2 <- subset(numeric_df_2, select= -c(eng_1, eng_2, eng_3, eng_4, eng_5))


#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(numeric_df_2, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "engagement_factor"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "engagement_factor", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[13:15, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement", caption = "Correlations calculated using Kendall's tau-b"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

library(corrplot)
## corrplot 0.92 loaded
# Recreate the Pearson's r correlation matrix to see if they are too divergent
correlation_matrix <- cor(numeric_df_2, use = "complete.obs")
corrplot(correlation_matrix, type = "upper", order = "hclust")

# Extract correlations for "engagement_factor"
engagement_correlations <- correlation_matrix[, "engagement_factor"]

engagement_factor_correlation_table <- data.frame(
  variable = names(engagement_correlations),
  correlation = engagement_correlations
)

engagement_factor_correlation_table <- engagement_factor_correlation_table[
  engagement_factor_correlation_table$variable != "engagement_factor", 
]

# Sort in descending order to get the strongest correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table %>%
  arrange(correlation)

# Keep only the top 3 correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table[13:15,]

# Reorder the factor levels
engagement_factor_correlation_table$variable <- factor(
  engagement_factor_correlation_table$variable,
  levels = engagement_factor_correlation_table$variable
)


# Create a ggplot with horizontal bars for the top 3 Pearson's r correlations
ggplot(engagement_factor_correlation_table, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement", caption = "Correlations calculated using Pearson's r"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

#engagement factor's correlations with other factors
library(dplyr)

#Create a df with composite factor scores for each group of items
composite_scores <- df_2 %>%
  mutate(
    Alignment = rowMeans(select(., ali_1:ali_3), na.rm = TRUE),  # Alignment score
    Collaboration = rowMeans(select(., col_1:col_3), na.rm = TRUE),  # Collaboration score
    Engagement = rowMeans(select(., eng_1:eng_5), na.rm = TRUE),  # Engagement score
    Inclusion = rowMeans(select(., inc_1:inc_5), na.rm = TRUE),  # Inclusion score
    Leadership = rowMeans(select(., lea_1:lea_4), na.rm = TRUE)  # Leadership score
  ) %>%
  # Select only the composite factor scores and an identifier (if needed)
  select(Alignment, Collaboration, Engagement, Inclusion, Leadership)

# Print the new data frame to ensure it contains the expected composite scores
print(composite_scores)
## # A tibble: 2,651 × 5
##    Alignment Collaboration Engagement Inclusion Leadership
##        <dbl>         <dbl>      <dbl>     <dbl>      <dbl>
##  1      4.67          3           5         4.6       4.75
##  2      4             4           3.6       3.8       4.75
##  3      4.67          4.67        4.4       5         5   
##  4      5             5           5         4.4       5   
##  5      4.67          5           4.4       4.8       5   
##  6      3.67          4           3.8       3.4       3.25
##  7      3.67          3           3.4       4         5   
##  8      4.67          4.67        4.6       4.6       5   
##  9      4.67          4.33        5         4.8       5   
## 10      5             4.67        4.4       4.6       4.5 
## # ℹ 2,641 more rows
composite_factor_scores <- subset(composite_scores, select= c(Alignment, Collaboration, Inclusion, Leadership))
print(composite_factor_scores)
## # A tibble: 2,651 × 4
##    Alignment Collaboration Inclusion Leadership
##        <dbl>         <dbl>     <dbl>      <dbl>
##  1      4.67          3          4.6       4.75
##  2      4             4          3.8       4.75
##  3      4.67          4.67       5         5   
##  4      5             5          4.4       5   
##  5      4.67          5          4.8       5   
##  6      3.67          4          3.4       3.25
##  7      3.67          3          4         5   
##  8      4.67          4.67       4.6       5   
##  9      4.67          4.33       4.8       5   
## 10      5             4.67       4.6       4.5 
## # ℹ 2,641 more rows
engagement_factor_scores <- subset(composite_scores, select= -c(Alignment, Collaboration, Inclusion, Leadership))
print(engagement_factor_scores)
## # A tibble: 2,651 × 1
##    Engagement
##         <dbl>
##  1        5  
##  2        3.6
##  3        4.4
##  4        5  
##  5        4.4
##  6        3.8
##  7        3.4
##  8        4.6
##  9        5  
## 10        4.4
## # ℹ 2,641 more rows
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(composite_scores, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "Engagement"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "Engagement", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[1:4, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Factors' relationships with Employee Engagement", caption = "Correlations calculated using Kendall's tau-b"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

 -  b. for women only (correlation calculated by conducting kendall's tau-b)
library(dplyr)

# Subset df_2 to include only female employees
df_2_female <- df_2 %>%
  filter(gender == "Female")

# Calculate the engagement factor for female employees
df_2_female$engagement_factor <- rowMeans(df_2_female[, c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5")], na.rm = TRUE)


numeric_df_2 <- Filter(is.numeric, df_2_female)
library(dplyr)
numeric_df_2 <- subset(numeric_df_2, select= -c(eng_1, eng_2, eng_3, eng_4, eng_5))


#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(numeric_df_2, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "engagement_factor"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "engagement_factor", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[13:15, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Women", caption = "Correlations calculated using Kendall's tau-b"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

# Recreate the Pearson's r correlation matrix to see if they are too divergent
correlation_matrix <- cor(numeric_df_2, use = "complete.obs")
corrplot(correlation_matrix, type = "upper", order = "hclust")

# Extract correlations for "engagement_factor"
engagement_correlations <- correlation_matrix[, "engagement_factor"]

engagement_factor_correlation_table <- data.frame(
  variable = names(engagement_correlations),
  correlation = engagement_correlations
)

engagement_factor_correlation_table <- engagement_factor_correlation_table[
  engagement_factor_correlation_table$variable != "engagement_factor", 
]

# Sort in descending order to get the strongest correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table %>%
  arrange(correlation)

# Keep only the top 3 correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table[13:15,]

# Reorder the factor levels
engagement_factor_correlation_table$variable <- factor(
  engagement_factor_correlation_table$variable,
  levels = engagement_factor_correlation_table$variable
)


# Create a ggplot with horizontal bars for the top 3 Pearson's r correlations
ggplot(engagement_factor_correlation_table, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Women", caption = "Correlations calculated using Pearson's r"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

#engagement factor's correlations with other factors
library(dplyr)

#Create a df with composite factor scores for each group of items
composite_scores <- df_2[df_2$gender == "Female", ] %>%
  mutate(
    Alignment = rowMeans(select(., ali_1:ali_3), na.rm = TRUE),  # Alignment score
    Collaboration = rowMeans(select(., col_1:col_3), na.rm = TRUE),  # Collaboration score
    Engagement = rowMeans(select(., eng_1:eng_5), na.rm = TRUE),  # Engagement score
    Inclusion = rowMeans(select(., inc_1:inc_5), na.rm = TRUE),  # Inclusion score
    Leadership = rowMeans(select(., lea_1:lea_4), na.rm = TRUE)  # Leadership score
  ) %>%
  # Select only the composite factor scores and an identifier (if needed)
  select(Alignment, Collaboration, Engagement, Inclusion, Leadership)

# Print the new data frame to ensure it contains the expected composite scores
print(composite_scores)
## # A tibble: 821 × 5
##    Alignment Collaboration Engagement Inclusion Leadership
##        <dbl>         <dbl>      <dbl>     <dbl>      <dbl>
##  1      3.67          4           3.8       3.4       3.25
##  2      5             4.67        4.4       4.6       4.5 
##  3      4.33          4.33        4.8       4.6       5   
##  4      4.33          4.33        5         4.4       4.25
##  5      4.33          4           3.8       4.6       4.75
##  6      4.33          4           4.2       3.4       4   
##  7      5             5           5         4.6       5   
##  8      2             3.33        2.6       2.8       4   
##  9      4.67          4.33        3.6       5         4.25
## 10      4             4           4.4       4.4       4   
## # ℹ 811 more rows
composite_factor_scores <- subset(composite_scores, select= c(Alignment, Collaboration, Inclusion, Leadership))
print(composite_factor_scores)
## # A tibble: 821 × 4
##    Alignment Collaboration Inclusion Leadership
##        <dbl>         <dbl>     <dbl>      <dbl>
##  1      3.67          4          3.4       3.25
##  2      5             4.67       4.6       4.5 
##  3      4.33          4.33       4.6       5   
##  4      4.33          4.33       4.4       4.25
##  5      4.33          4          4.6       4.75
##  6      4.33          4          3.4       4   
##  7      5             5          4.6       5   
##  8      2             3.33       2.8       4   
##  9      4.67          4.33       5         4.25
## 10      4             4          4.4       4   
## # ℹ 811 more rows
engagement_factor_scores <- subset(composite_scores, select= -c(Alignment, Collaboration, Inclusion, Leadership))
print(engagement_factor_scores)
## # A tibble: 821 × 1
##    Engagement
##         <dbl>
##  1        3.8
##  2        4.4
##  3        4.8
##  4        5  
##  5        3.8
##  6        4.2
##  7        5  
##  8        2.6
##  9        3.6
## 10        4.4
## # ℹ 811 more rows
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(composite_scores, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "Engagement"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "Engagement", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[1:4, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Factors' relationships with Employee Engagement for Women", caption = "Correlations calculated using Kendall's tau-b"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

c. for male employees (correlation calculated by conducting kendall’s tau-b)

library(dplyr)

# Subset df_2 to include only male employees
df_2_male <- df_2 %>%
  filter(gender == "Male")

# Calculate the engagement factor for female employees
df_2_male$engagement_factor <- rowMeans(df_2_male[, c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5")], na.rm = TRUE)

# Display a few rows to check the new variable
head(df_2_male)
## # A tibble: 6 × 30
##   eeid   ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01434     5     5     4     5     3     1     5     5     5     5     5     5
## 2 M00631     4     4     4     4     4     4     4     4     4     3     3     4
## 3 M00325     5     5     4     5     4     5     5     5     5     4     3     5
## 4 M00805     5     5     5     5     5     5     5     5     5     5     5     5
## 5 M00157     5     5     4     5     5     5     5     5     5     4     3     5
## 6 M01220     4     3     4     3     3     3     4     3     4     3     3     4
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
numeric_df_2 <- Filter(is.numeric, df_2_male)
library(dplyr)
numeric_df_2 <- subset(numeric_df_2, select= -c(eng_1, eng_2, eng_3, eng_4, eng_5))


#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(numeric_df_2, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "engagement_factor"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "engagement_factor", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[13:15, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Men", caption = "Correlations calculated using Kendall's tau-b"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

# Recreate the Pearson's r correlation matrix to see if they are too divergent
correlation_matrix <- cor(numeric_df_2, use = "complete.obs")
corrplot(correlation_matrix, type = "upper", order = "hclust")

# Extract correlations for "engagement_factor"
engagement_correlations <- correlation_matrix[, "engagement_factor"]

engagement_factor_correlation_table <- data.frame(
  variable = names(engagement_correlations),
  correlation = engagement_correlations
)

engagement_factor_correlation_table <- engagement_factor_correlation_table[
  engagement_factor_correlation_table$variable != "engagement_factor", 
]

# Sort in descending order to get the strongest correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table %>%
  arrange(correlation)

# Keep only the top 3 correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table[13:15,]

# Reorder the factor levels
engagement_factor_correlation_table$variable <- factor(
  engagement_factor_correlation_table$variable,
  levels = engagement_factor_correlation_table$variable
)


# Create a ggplot with horizontal bars for the top 3 Pearson's r correlations
ggplot(engagement_factor_correlation_table, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Men", caption = "Correlations calculated using Pearson's r"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

#engagement factor's correlations with other factors
library(dplyr)

#Create a df with composite factor scores for each group of items
composite_scores <- df_2[df_2$gender == "Male", ] %>%
  mutate(
    Alignment = rowMeans(select(., ali_1:ali_3), na.rm = TRUE),  # Alignment score
    Collaboration = rowMeans(select(., col_1:col_3), na.rm = TRUE),  # Collaboration score
    Engagement = rowMeans(select(., eng_1:eng_5), na.rm = TRUE),  # Engagement score
    Inclusion = rowMeans(select(., inc_1:inc_5), na.rm = TRUE),  # Inclusion score
    Leadership = rowMeans(select(., lea_1:lea_4), na.rm = TRUE)  # Leadership score
  ) %>%
  # Select only the composite factor scores and an identifier (if needed)
  select(Alignment, Collaboration, Engagement, Inclusion, Leadership)

# Print the new data frame to ensure it contains the expected composite scores
print(composite_scores)
## # A tibble: 1,831 × 5
##    Alignment Collaboration Engagement Inclusion Leadership
##        <dbl>         <dbl>      <dbl>     <dbl>      <dbl>
##  1      4.67          3           5         4.6       4.75
##  2      4             4           3.6       3.8       4.75
##  3      4.67          4.67        4.4       5         5   
##  4      5             5           5         4.4       5   
##  5      4.67          5           4.4       4.8       5   
##  6      3.67          3           3.4       4         5   
##  7      4.67          4.67        4.6       4.6       5   
##  8      4.67          4.33        5         4.8       5   
##  9      4.67          5           4         5         5   
## 10      4             3.67        4         3.6       4   
## # ℹ 1,821 more rows
composite_factor_scores <- subset(composite_scores, select= c(Alignment, Collaboration, Inclusion, Leadership))
print(composite_factor_scores)
## # A tibble: 1,831 × 4
##    Alignment Collaboration Inclusion Leadership
##        <dbl>         <dbl>     <dbl>      <dbl>
##  1      4.67          3          4.6       4.75
##  2      4             4          3.8       4.75
##  3      4.67          4.67       5         5   
##  4      5             5          4.4       5   
##  5      4.67          5          4.8       5   
##  6      3.67          3          4         5   
##  7      4.67          4.67       4.6       5   
##  8      4.67          4.33       4.8       5   
##  9      4.67          5          5         5   
## 10      4             3.67       3.6       4   
## # ℹ 1,821 more rows
engagement_factor_scores <- subset(composite_scores, select= -c(Alignment, Collaboration, Inclusion, Leadership))
print(engagement_factor_scores)
## # A tibble: 1,831 × 1
##    Engagement
##         <dbl>
##  1        5  
##  2        3.6
##  3        4.4
##  4        5  
##  5        4.4
##  6        3.4
##  7        4.6
##  8        5  
##  9        4  
## 10        4  
## # ℹ 1,821 more rows
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(composite_scores, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "Engagement"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "Engagement", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[1:4, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Factors' relationships with Employee Engagement for Men", caption = "Correlations calculated using Kendall's tau-b"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

d. for Indian employees (correlation calculated by conducting kendall’s tau-b)

library(dplyr)

# Subset df_2 to include only Indian employees
df_2_india <- df_2 %>%
  filter(country == "India")

# Calculate the engagement factor for female employees
df_2_india$engagement_factor <- rowMeans(df_2_india[, c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5")], na.rm = TRUE)

# Display a few rows to check the new variable
head(df_2_india)
## # A tibble: 6 × 30
##   eeid   ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M01471     5     5     4     5     5     5     5     5     4     3     3     5
## 2 M02064     5     4     3     5     3     4     5     5     3     4     4     5
## 3 M00210     5     5     5     5     5     5     5     5     5     4     4     5
## 4 M02430     5     5     4     3     5     5     5     5     3     2     3     5
## 5 M01775     5     5     5     5     5     5     5     5     5     5     5     5
## 6 M01748     3     4     3     4     3     5     5     5     5     4     5     2
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
numeric_df_2 <- Filter(is.numeric, df_2_india)
library(dplyr)
numeric_df_2 <- subset(numeric_df_2, select= -c(eng_1, eng_2, eng_3, eng_4, eng_5))


#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(numeric_df_2, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "engagement_factor"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "engagement_factor", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[13:15, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Indian Employees", caption = "Correlations calculated using Kendall's tau-b"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

# Recreate the Pearson's r correlation matrix to see if they are too divergent
correlation_matrix <- cor(numeric_df_2, use = "complete.obs")
corrplot(correlation_matrix, type = "upper", order = "hclust")

# Extract correlations for "engagement_factor"
engagement_correlations <- correlation_matrix[, "engagement_factor"]

engagement_factor_correlation_table <- data.frame(
  variable = names(engagement_correlations),
  correlation = engagement_correlations
)

engagement_factor_correlation_table <- engagement_factor_correlation_table[
  engagement_factor_correlation_table$variable != "engagement_factor", 
]

# Sort in descending order to get the strongest correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table %>%
  arrange(correlation)

# Keep only the top 3 correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table[13:15,]

# Reorder the factor levels
engagement_factor_correlation_table$variable <- factor(
  engagement_factor_correlation_table$variable,
  levels = engagement_factor_correlation_table$variable
)


# Create a ggplot with horizontal bars for the top 3 Pearson's r correlations
ggplot(engagement_factor_correlation_table, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Indian Employees", caption = "Correlations calculated using Pearson's r"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

#engagement factor's correlations with other factors
library(dplyr)

#Create a df with composite factor scores for each group of items
composite_scores <- df_2[df_2$country == "India", ] %>%
  mutate(
    Alignment = rowMeans(select(., ali_1:ali_3), na.rm = TRUE),  # Alignment score
    Collaboration = rowMeans(select(., col_1:col_3), na.rm = TRUE),  # Collaboration score
    Engagement = rowMeans(select(., eng_1:eng_5), na.rm = TRUE),  # Engagement score
    Inclusion = rowMeans(select(., inc_1:inc_5), na.rm = TRUE),  # Inclusion score
    Leadership = rowMeans(select(., lea_1:lea_4), na.rm = TRUE)  # Leadership score
  ) %>%
  # Select only the composite factor scores and an identifier (if needed)
  select(Alignment, Collaboration, Engagement, Inclusion, Leadership)

# Print the new data frame to ensure it contains the expected composite scores
print(composite_scores)
## # A tibble: 288 × 5
##    Alignment Collaboration Engagement Inclusion Leadership
##        <dbl>         <dbl>      <dbl>     <dbl>      <dbl>
##  1      4.67          5           4         5         5   
##  2      4             4           4.2       5         5   
##  3      5             5           4.6       5         5   
##  4      4.67          4.33        3.6       5         4.25
##  5      5             5           5         5         5   
##  6      3.33          4           4.8       3.6       4   
##  7      4.67          5           4.2       5         5   
##  8      3             3.67        4.6       4.2       3.5 
##  9      4             4           4.8       4         4   
## 10      4.67          5           5         5         5   
## # ℹ 278 more rows
composite_factor_scores <- subset(composite_scores, select= c(Alignment, Collaboration, Inclusion, Leadership))
print(composite_factor_scores)
## # A tibble: 288 × 4
##    Alignment Collaboration Inclusion Leadership
##        <dbl>         <dbl>     <dbl>      <dbl>
##  1      4.67          5          5         5   
##  2      4             4          5         5   
##  3      5             5          5         5   
##  4      4.67          4.33       5         4.25
##  5      5             5          5         5   
##  6      3.33          4          3.6       4   
##  7      4.67          5          5         5   
##  8      3             3.67       4.2       3.5 
##  9      4             4          4         4   
## 10      4.67          5          5         5   
## # ℹ 278 more rows
engagement_factor_scores <- subset(composite_scores, select= -c(Alignment, Collaboration, Inclusion, Leadership))
print(engagement_factor_scores)
## # A tibble: 288 × 1
##    Engagement
##         <dbl>
##  1        4  
##  2        4.2
##  3        4.6
##  4        3.6
##  5        5  
##  6        4.8
##  7        4.2
##  8        4.6
##  9        4.8
## 10        5  
## # ℹ 278 more rows
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(composite_scores, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "Engagement"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "Engagement", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[1:4, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Factors' relationships with Employee Engagement for Indian Employees", caption = "Correlations calculated using Kendall's tau-b"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

e. for German employees (correlation calculated by conducting kendall’s tau-b)

library(dplyr)

# Subset df_2 to include only Indian employees
df_2_germany <- df_2 %>%
  filter(country == "Germany")

# Calculate the engagement factor for german employees
df_2_germany$engagement_factor <- rowMeans(df_2_germany[, c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5")], na.rm = TRUE)

# Display a few rows to check the new variable
head(df_2_germany)
## # A tibble: 6 × 30
##   eeid   ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##   <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 M00166     5     5     4     3     3     4     5     5     4     2     3     5
## 2 M01969     4     4     4     5     5     5     5     5     4     3     5     4
## 3 M01232     4     4     3     3     3     3     3     4     4     3     4     3
## 4 M01944     3     4     4     4     2     4     4     4     3     3     3     4
## 5 M00395     4     4     5     4     4     4     5     4     5     4     4     5
## 6 M00532     5     5     3     4     2     2     3     4     4     2     3     4
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
numeric_df_2 <- Filter(is.numeric, df_2_germany)
library(dplyr)
numeric_df_2 <- subset(numeric_df_2, select= -c(eng_1, eng_2, eng_3, eng_4, eng_5))


#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(numeric_df_2, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "engagement_factor"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "engagement_factor", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[13:15, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for German Employees", caption = "Correlations calculated using Kendall's tau-b"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

# Recreate the Pearson's r correlation matrix to see if they are too divergent
correlation_matrix <- cor(numeric_df_2, use = "complete.obs")
corrplot(correlation_matrix, type = "upper", order = "hclust")

# Extract correlations for "engagement_factor"
engagement_correlations <- correlation_matrix[, "engagement_factor"]

engagement_factor_correlation_table <- data.frame(
  variable = names(engagement_correlations),
  correlation = engagement_correlations
)

engagement_factor_correlation_table <- engagement_factor_correlation_table[
  engagement_factor_correlation_table$variable != "engagement_factor", 
]

# Sort in descending order to get the strongest correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table %>%
  arrange(correlation)

# Keep only the top 3 correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table[13:15,]

# Reorder the factor levels
engagement_factor_correlation_table$variable <- factor(
  engagement_factor_correlation_table$variable,
  levels = engagement_factor_correlation_table$variable
)


# Create a ggplot with horizontal bars for the top 3 Pearson's r correlations
ggplot(engagement_factor_correlation_table, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for German Employees", caption = "Correlations calculated using Pearson's r"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

#engagement factor's correlations with other factors
library(dplyr)

#Create a df with composite factor scores for each group of items
composite_scores <- df_2[df_2$country == "Germany", ] %>%
  mutate(
    Alignment = rowMeans(select(., ali_1:ali_3), na.rm = TRUE),  # Alignment score
    Collaboration = rowMeans(select(., col_1:col_3), na.rm = TRUE),  # Collaboration score
    Engagement = rowMeans(select(., eng_1:eng_5), na.rm = TRUE),  # Engagement score
    Inclusion = rowMeans(select(., inc_1:inc_5), na.rm = TRUE),  # Inclusion score
    Leadership = rowMeans(select(., lea_1:lea_4), na.rm = TRUE)  # Leadership score
  ) %>%
  # Select only the composite factor scores and an identifier (if needed)
  select(Alignment, Collaboration, Engagement, Inclusion, Leadership)

# Print the new data frame to ensure it contains the expected composite scores
print(composite_scores)
## # A tibble: 48 × 5
##    Alignment Collaboration Engagement Inclusion Leadership
##        <dbl>         <dbl>      <dbl>     <dbl>      <dbl>
##  1      4.67          3.33        3.8       4         4   
##  2      4             5           4.4       3.8       5   
##  3      3.67          3           3.6       2.8       3.5 
##  4      3.67          3.33        3.4       3.6       4   
##  5      4.33          4           4.4       5         4.75
##  6      4.33          2.67        3.2       3.4       2.75
##  7      4             3.67        4.4       4.8       4.25
##  8      4             3.33        4         5         5   
##  9      5             4.33        4.6       5         5   
## 10      3.67          3           3         5         4   
## # ℹ 38 more rows
composite_factor_scores <- subset(composite_scores, select= c(Alignment, Collaboration, Inclusion, Leadership))
print(composite_factor_scores)
## # A tibble: 48 × 4
##    Alignment Collaboration Inclusion Leadership
##        <dbl>         <dbl>     <dbl>      <dbl>
##  1      4.67          3.33       4         4   
##  2      4             5          3.8       5   
##  3      3.67          3          2.8       3.5 
##  4      3.67          3.33       3.6       4   
##  5      4.33          4          5         4.75
##  6      4.33          2.67       3.4       2.75
##  7      4             3.67       4.8       4.25
##  8      4             3.33       5         5   
##  9      5             4.33       5         5   
## 10      3.67          3          5         4   
## # ℹ 38 more rows
engagement_factor_scores <- subset(composite_scores, select= -c(Alignment, Collaboration, Inclusion, Leadership))
print(engagement_factor_scores)
## # A tibble: 48 × 1
##    Engagement
##         <dbl>
##  1        3.8
##  2        4.4
##  3        3.6
##  4        3.4
##  5        4.4
##  6        3.2
##  7        4.4
##  8        4  
##  9        4.6
## 10        3  
## # ℹ 38 more rows
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(composite_scores, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "Engagement"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "Engagement", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[1:4, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Factors' relationships with Employee Engagement for German Employees", caption = "Correlations calculated using Kendall's tau-b"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

f. for employees who are high (engagement factor composite score > or = 4) on employee_engagement (correlation calculated by conducting kendall’s tau-b)

library(dplyr)

# Calculate the engagement factor for employees
df_2$engagement_factor <- rowMeans(df_2[, c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5")], na.rm = TRUE)

hist(df_2$engagement_factor)

high_engagement <- df_2 %>%
  filter(engagement_factor >= 4)

numeric_df_2 <- Filter(is.numeric, high_engagement)
library(dplyr)
numeric_df_2 <- subset(numeric_df_2, select= -c(eng_1, eng_2, eng_3, eng_4, eng_5))


#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(numeric_df_2, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "engagement_factor"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "engagement_factor", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[13:15, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for highly engaging employees", caption = "Correlations calculated using Kendall's tau-b | employees with engagement composite factor score greater than or equal to 4 are only included for analysis"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

# Recreate the Pearson's r correlation matrix to see if they are too divergent
correlation_matrix <- cor(numeric_df_2, use = "complete.obs")
corrplot(correlation_matrix, type = "upper", order = "hclust")

# Extract correlations for "engagement_factor"
engagement_correlations <- correlation_matrix[, "engagement_factor"]

engagement_factor_correlation_table <- data.frame(
  variable = names(engagement_correlations),
  correlation = engagement_correlations
)

engagement_factor_correlation_table <- engagement_factor_correlation_table[
  engagement_factor_correlation_table$variable != "engagement_factor", 
]

# Sort in descending order to get the strongest correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table %>%
  arrange(correlation)

# Keep only the top 3 correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table[13:15,]

# Reorder the factor levels
engagement_factor_correlation_table$variable <- factor(
  engagement_factor_correlation_table$variable,
  levels = engagement_factor_correlation_table$variable
)


# Create a ggplot with horizontal bars for the top 3 Pearson's r correlations
ggplot(engagement_factor_correlation_table, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Highly Engaged Employees", caption = "Correlations calculated using Pearson's r | employees with engagement composite factor score greater than or equal to 4 are only included for analysis"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

#engagement factor's correlations with other factors
library(dplyr)

#Create a df with composite factor scores for each group of items
composite_scores <- df_2 %>% mutate(
    Alignment = rowMeans(select(., ali_1:ali_3), na.rm = TRUE),  # Alignment score
    Collaboration = rowMeans(select(., col_1:col_3), na.rm = TRUE),  # Collaboration score
    Engagement = rowMeans(select(., eng_1:eng_5), na.rm = TRUE),  # Engagement score
    Inclusion = rowMeans(select(., inc_1:inc_5), na.rm = TRUE),  # Inclusion score
    Leadership = rowMeans(select(., lea_1:lea_4), na.rm = TRUE)  # Leadership score
  ) %>%
  # Select only the composite factor scores and an identifier (if needed)
  select(Alignment, Collaboration, Engagement, Inclusion, Leadership)

# Print the new data frame to ensure it contains the expected composite scores
print(composite_scores)
## # A tibble: 2,651 × 5
##    Alignment Collaboration Engagement Inclusion Leadership
##        <dbl>         <dbl>      <dbl>     <dbl>      <dbl>
##  1      4.67          3           5         4.6       4.75
##  2      4             4           3.6       3.8       4.75
##  3      4.67          4.67        4.4       5         5   
##  4      5             5           5         4.4       5   
##  5      4.67          5           4.4       4.8       5   
##  6      3.67          4           3.8       3.4       3.25
##  7      3.67          3           3.4       4         5   
##  8      4.67          4.67        4.6       4.6       5   
##  9      4.67          4.33        5         4.8       5   
## 10      5             4.67        4.4       4.6       4.5 
## # ℹ 2,641 more rows
composite_scores_high <- composite_scores %>%
  filter(Engagement >= 4)

composite_factor_scores <- subset(composite_scores_high, select= c(Alignment, Collaboration, Inclusion, Leadership))
print(composite_factor_scores)
## # A tibble: 1,819 × 4
##    Alignment Collaboration Inclusion Leadership
##        <dbl>         <dbl>     <dbl>      <dbl>
##  1      4.67          3          4.6       4.75
##  2      4.67          4.67       5         5   
##  3      5             5          4.4       5   
##  4      4.67          5          4.8       5   
##  5      4.67          4.67       4.6       5   
##  6      4.67          4.33       4.8       5   
##  7      5             4.67       4.6       4.5 
##  8      4.67          5          5         5   
##  9      4             3.67       3.6       4   
## 10    NaN           NaN        NaN       NaN   
## # ℹ 1,809 more rows
engagement_factor_scores <- subset(composite_scores_high, select= -c(Alignment, Collaboration, Inclusion, Leadership))
print(engagement_factor_scores)
## # A tibble: 1,819 × 1
##    Engagement
##         <dbl>
##  1        5  
##  2        4.4
##  3        5  
##  4        4.4
##  5        4.6
##  6        5  
##  7        4.4
##  8        4  
##  9        4  
## 10        4  
## # ℹ 1,809 more rows
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(composite_scores, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "Engagement"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "Engagement", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[1:4, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Factors' relationships with Employee Engagement for Highly Engaged Employees", caption = "Correlations calculated using Kendall's tau-b | employees with engagement composite factor score greater than or equal to 4 are only included for analysis" 
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

  1. for employees who are low (engagement factor composite score < or = 2) on employee_engagement (correlation calculated by conducting kendall’s tau-b) = 47 employees only
library(dplyr)

# Calculate the engagement factor for employees
df_2$engagement_factor <- rowMeans(df_2[, c("eng_1", "eng_2", "eng_3", "eng_4", "eng_5")], na.rm = TRUE)

hist(df_2$engagement_factor)

low_engagement <- df_2 %>%
  filter(engagement_factor <= 2)

numeric_df_2 <- Filter(is.numeric, low_engagement)
library(dplyr)
numeric_df_2 <- subset(numeric_df_2, select= -c(eng_1, eng_2, eng_3, eng_4, eng_5))


#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(numeric_df_2, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "engagement_factor"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "engagement_factor", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[13:15, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Less Engaged employees", caption = "Correlations calculated using Kendall's tau-b | Only employees with engagement composite factor score lower than or equal to 2 are included for this analysis"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

# Recreate the Pearson's r correlation matrix to see if they are too divergent
correlation_matrix <- cor(numeric_df_2, use = "complete.obs")
corrplot(correlation_matrix, type = "upper", order = "hclust")

# Extract correlations for "engagement_factor"
engagement_correlations <- correlation_matrix[, "engagement_factor"]

engagement_factor_correlation_table <- data.frame(
  variable = names(engagement_correlations),
  correlation = engagement_correlations
)

engagement_factor_correlation_table <- engagement_factor_correlation_table[
  engagement_factor_correlation_table$variable != "engagement_factor", 
]

# Sort in descending order to get the strongest correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table %>%
  arrange(correlation)

# Keep only the top 3 correlations
engagement_factor_correlation_table <- engagement_factor_correlation_table[13:15,]

# Reorder the factor levels
engagement_factor_correlation_table$variable <- factor(
  engagement_factor_correlation_table$variable,
  levels = engagement_factor_correlation_table$variable
)


# Create a ggplot with horizontal bars for the top 3 Pearson's r correlations
ggplot(engagement_factor_correlation_table, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Top 3 Factors with Strongest Associations to Employee Engagement for Less Engaged Employees", caption = "Correlations calculated using Pearson's r | Only employees with engagement composite factor score lower than or equal to 2 are included for this analysis"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

#engagement factor's correlations with other factors
library(dplyr)

#Create a df with composite factor scores for each group of items
composite_scores <- df_2 %>% mutate(
    Alignment = rowMeans(select(., ali_1:ali_3), na.rm = TRUE),  # Alignment score
    Collaboration = rowMeans(select(., col_1:col_3), na.rm = TRUE),  # Collaboration score
    Engagement = rowMeans(select(., eng_1:eng_5), na.rm = TRUE),  # Engagement score
    Inclusion = rowMeans(select(., inc_1:inc_5), na.rm = TRUE),  # Inclusion score
    Leadership = rowMeans(select(., lea_1:lea_4), na.rm = TRUE)  # Leadership score
  ) %>%
  # Select only the composite factor scores and an identifier (if needed)
  select(Alignment, Collaboration, Engagement, Inclusion, Leadership)

# Print the new data frame to ensure it contains the expected composite scores
print(composite_scores)
## # A tibble: 2,651 × 5
##    Alignment Collaboration Engagement Inclusion Leadership
##        <dbl>         <dbl>      <dbl>     <dbl>      <dbl>
##  1      4.67          3           5         4.6       4.75
##  2      4             4           3.6       3.8       4.75
##  3      4.67          4.67        4.4       5         5   
##  4      5             5           5         4.4       5   
##  5      4.67          5           4.4       4.8       5   
##  6      3.67          4           3.8       3.4       3.25
##  7      3.67          3           3.4       4         5   
##  8      4.67          4.67        4.6       4.6       5   
##  9      4.67          4.33        5         4.8       5   
## 10      5             4.67        4.4       4.6       4.5 
## # ℹ 2,641 more rows
composite_scores_low <- composite_scores %>%
  filter(Engagement <= 2)

composite_factor_scores <- subset(composite_scores_low, select= c(Alignment, Collaboration, Inclusion, Leadership))
print(composite_factor_scores)
## # A tibble: 47 × 4
##    Alignment Collaboration Inclusion Leadership
##        <dbl>         <dbl>     <dbl>      <dbl>
##  1      3.33          4          4.2       1.75
##  2      2             1          2.2       3.25
##  3      2.33          3.33       2         2   
##  4      3             4          3.6       3   
##  5      1.67          4          2.8       2.5 
##  6      2.33          2          3         1   
##  7      2.33          3          3.8       3   
##  8      1.67          3.67       2.6       4.5 
##  9      3             2.67       3         2.5 
## 10      1.67          2.67       2.8       2.75
## # ℹ 37 more rows
engagement_factor_scores <- subset(composite_scores_low, select= -c(Alignment, Collaboration, Inclusion, Leadership))
print(engagement_factor_scores)
## # A tibble: 47 × 1
##    Engagement
##         <dbl>
##  1        1.8
##  2        1.2
##  3        1.8
##  4        2  
##  5        2  
##  6        1.4
##  7        2  
##  8        1.8
##  9        2  
## 10        1.8
## # ℹ 37 more rows
#Conduct Kendall's tau-b correlation to get correlations
correlation_matrix_kendall <- cor(composite_scores, method = "kendall", use = "complete.obs")

#Extract the correlations for "engagement_factor"
engagement_correlations_kendall <- correlation_matrix_kendall[, "Engagement"]

engagement_factor_correlation_table_kendall <- data.frame(
  variable = names(engagement_correlations_kendall),
  correlation = engagement_correlations_kendall
)

#Remove 'engagement_factor' and sort by correlation in descending order
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[
  engagement_factor_correlation_table_kendall$variable != "Engagement", 
]
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall %>%
  arrange(correlation)

#Keep only the top 3 strongest correlations
engagement_factor_correlation_table_kendall <- engagement_factor_correlation_table_kendall[1:4, ]

# Reorder the factor levels based on the new order
engagement_factor_correlation_table_kendall$variable <- factor(
  engagement_factor_correlation_table_kendall$variable,
  levels = engagement_factor_correlation_table_kendall$variable
)

# Use scales package to format correlations as percentages
library(scales)


# Create a ggplot with horizontal bars for the top 3 Kendall's tau-b correlations
ggplot(engagement_factor_correlation_table_kendall, aes(x = correlation, y = variable)) +
  geom_bar(stat = "identity", fill = "purple", alpha = 0.7) +
  geom_text(aes(label = percent(correlation, accuracy = 1)), hjust = -0.1, size = 3.5, color = "black")  +
  labs(x = "Correlation", y = "Variable", title = "Factors' relationships with Employee Engagement for Less Engaged Employees", caption = "Correlations calculated using Kendall's tau-b | Only employees with engagement composite factor score lower than or equal to 2 are included for analysis" 
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 12),
    axis.title = element_text(size = 14),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    plot.caption = element_text(size = 10, hjust = 0)
  )

    1. Calculate engagement & leadership factor scores for all countries.
      1. Australia’s N = 101
df_2[df_2$country == "Australia" & !is.na(df_2$country), ]
## # A tibble: 101 × 30
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M017…     4     4     4     4     4     3     4     4     4     4     4     4
##  2 M017…     4     4     3     4     4     4     4     4     5     4     4     4
##  3 M012…     4     3     4     3     4     4     4     4     3     3     2     4
##  4 M006…     5     5     5     4     5     3     5     5     5     5     5     5
##  5 M024…     5     5     3     5     5     2     5     5     4     5     4     4
##  6 M025…     5     5     5     5     4     5     4     5     4     5     5     5
##  7 M002…     5     3     4     4     3     1     5     4     4     2     3     5
##  8 M000…     5     5     5     5     5     5     5     5     5     5     5     5
##  9 M012…     4     4     4     4     4     3     4     4     4     3     4     4
## 10 M023…     4     5     4     4     3     2     4     5     4     4     3     4
## # ℹ 91 more rows
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "Australia", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9207921
## 2           eng_2          0.9306931
## 3           eng_3          0.8514851
## 4           eng_4          0.6930693
## 5           eng_5          0.7425743
## 6           lea_1          0.8613861
## 7           lea_2          0.9207921
## 8           lea_3          0.9603960
## 9           lea_4          0.9603960
# Calculate factor favorability score for engagement & leadership 

## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations

## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(
    factor_favorable_score,
    data.frame(
      factor_abbreviation = factor_abbr,
      factor_favorable_score = avg_favorable_score,
      country = "Australia"  # Add the country label
    )
  )
}




print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score   country
## 1                 eng              0.8277228 Australia
## 2                 lea              0.9257426 Australia
favorability_score$Country <- "Australia" 

#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score   Country
## 10             eng          0.8277228 Australia
## 1            eng_1          0.9207921 Australia
## 2            eng_2          0.9306931 Australia
## 3            eng_3          0.8514851 Australia
## 4            eng_4          0.6930693 Australia
## 5            eng_5          0.7425743 Australia
## 11             lea          0.9257426 Australia
## 6            lea_1          0.8613861 Australia
## 7            lea_2          0.9207921 Australia
## 8            lea_3          0.9603960 Australia
## 9            lea_4          0.9603960 Australia
#let's only keep engagement scores

engagement_favorable_scores <- combined_favorability_df[1:6, ] 
print(engagement_favorable_scores)
##    question_number favorability_score   Country
## 10             eng          0.8277228 Australia
## 1            eng_1          0.9207921 Australia
## 2            eng_2          0.9306931 Australia
## 3            eng_3          0.8514851 Australia
## 4            eng_4          0.6930693 Australia
## 5            eng_5          0.7425743 Australia
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category


library(scales)

# Create a ggplot that shows engagement factor favorable scores for Aussie employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +  
  labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for Aussie Employees") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) + 
  scale_fill_manual(values = highlight_colors) +  
  scale_y_continuous(labels = percent_format(accuracy = 1)) 

- b. Canada’s N = 84

df_2[df_2$country == "Canada" & !is.na(df_2$country), ]
## # A tibble: 84 × 30
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M002…     5     5     4     5     4     4     5     5     5     5     5     5
##  2 M001…     5     5     3     5     3     5     5     5     5     4     5     5
##  3 M022…     5     5     5     5     4     3     5     5     5     5     5     5
##  4 M023…     5     5     5     5     5     5     4     5     5     4     5     4
##  5 M009…     4     4     4     5     5     5     4     4     4     4     4     5
##  6 M013…     5     5     4     5     5     5     5     5     5     4     5     5
##  7 M015…     4     4     4     4     4     4     4     4     4     4     4     4
##  8 M006…     5     5     4     4    NA     2     5     5     5     5     5     5
##  9 M012…     5     5     4     4     4     5     5     5     5     3     3     5
## 10 M019…     4     4     4     4     3     3     4     4     4     4     4     4
## # ℹ 74 more rows
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "Canada", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9285714
## 2           eng_2          0.8333333
## 3           eng_3          0.8690476
## 4           eng_4          0.7619048
## 5           eng_5          0.7976190
## 6           lea_1          0.8452381
## 7           lea_2          0.8333333
## 8           lea_3          0.9404762
## 9           lea_4          0.8313253
# Calculate factor favorability score for engagement & leadership 

## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations

## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(
    factor_favorable_score,
    data.frame(
      factor_abbreviation = factor_abbr,
      factor_favorable_score = avg_favorable_score,
      country = "Canada"  # Add the country label
    )
  )
}




print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score country
## 1                 eng              0.8380952  Canada
## 2                 lea              0.8625932  Canada
favorability_score$Country <- "Canada" 

#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score Country
## 10             eng          0.8380952  Canada
## 1            eng_1          0.9285714  Canada
## 2            eng_2          0.8333333  Canada
## 3            eng_3          0.8690476  Canada
## 4            eng_4          0.7619048  Canada
## 5            eng_5          0.7976190  Canada
## 11             lea          0.8625932  Canada
## 6            lea_1          0.8452381  Canada
## 7            lea_2          0.8333333  Canada
## 8            lea_3          0.9404762  Canada
## 9            lea_4          0.8313253  Canada
#let's only keep engagement scores

engagement_favorable_scores <- combined_favorability_df[1:6, ] 
print(engagement_favorable_scores)
##    question_number favorability_score Country
## 10             eng          0.8380952  Canada
## 1            eng_1          0.9285714  Canada
## 2            eng_2          0.8333333  Canada
## 3            eng_3          0.8690476  Canada
## 4            eng_4          0.7619048  Canada
## 5            eng_5          0.7976190  Canada
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category


library(scales)

# Create a ggplot that shows engagement factor favorable scores for Danish employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +  
  labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for Canadian Employees") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) + 
  scale_fill_manual(values = highlight_colors) +  
  scale_y_continuous(labels = percent_format(accuracy = 1))

- c. China’s N = 60

df_2[df_2$country == "China" & !is.na(df_2$country), ]
## # A tibble: 60 × 30
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M000…     5     4     4     3     5     3     5     5     5     5     5     5
##  2 M012…     5     5     4     5     4     4     4     4     4     5     5     4
##  3 M019…     4     4     4     4     4     4     4     4     4     4     5     4
##  4 M012…     4     2     2     4     5     3     4     3     4     3     3     5
##  5 M021…     4     2     3     3     3     1     3     3     3     2     3     3
##  6 M001…     4     4     4     2     3     3     4     3     5     4     4     4
##  7 M020…     4     5     4     5     4     4     5     5     5     5     5     5
##  8 M014…     4     4     4     5     3     3     5     5     3     4     5     5
##  9 M022…     5     5     4     5     5     5     5     5     5     5     5     5
## 10 M004…     5     5     5     5     5     5     5     5     5     5     5     5
## # ℹ 50 more rows
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "China", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.8666667
## 2           eng_2          0.7833333
## 3           eng_3          0.7833333
## 4           eng_4          0.7833333
## 5           eng_5          0.8166667
## 6           lea_1          0.8333333
## 7           lea_2          0.8166667
## 8           lea_3          0.8833333
## 9           lea_4          0.8666667
# Calculate factor favorability score for engagement & leadership 

## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations

## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(
    factor_favorable_score,
    data.frame(
      factor_abbreviation = factor_abbr,
      factor_favorable_score = avg_favorable_score,
      country = "China"  # Add the country label
    )
  )
}




print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score country
## 1                 eng              0.8066667   China
## 2                 lea              0.8500000   China
favorability_score$Country <- "China" 

#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score Country
## 10             eng          0.8066667   China
## 1            eng_1          0.8666667   China
## 2            eng_2          0.7833333   China
## 3            eng_3          0.7833333   China
## 4            eng_4          0.7833333   China
## 5            eng_5          0.8166667   China
## 11             lea          0.8500000   China
## 6            lea_1          0.8333333   China
## 7            lea_2          0.8166667   China
## 8            lea_3          0.8833333   China
## 9            lea_4          0.8666667   China
#let's only keep engagement scores

engagement_favorable_scores <- combined_favorability_df[1:6, ] 
print(engagement_favorable_scores)
##    question_number favorability_score Country
## 10             eng          0.8066667   China
## 1            eng_1          0.8666667   China
## 2            eng_2          0.7833333   China
## 3            eng_3          0.7833333   China
## 4            eng_4          0.7833333   China
## 5            eng_5          0.8166667   China
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category


library(scales)

# Create a ggplot that shows engagement factor favorable scores for Chinese employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +  
  labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for Chinese Employees") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) + 
  scale_fill_manual(values = highlight_colors) +  
  scale_y_continuous(labels = percent_format(accuracy = 1))

- d. Denmark’s N = 24

df_2[df_2$country == "Denmark" & !is.na(df_2$country), ]
## # A tibble: 24 × 30
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M022…     4     2     4     4     2     1     3     3     5     3     3     4
##  2 M008…     5     5     4     5     4     5     5     5     4     5     5     5
##  3 M009…     5     2     3     3     3     1     4     4     3     3     3     5
##  4 M023…     5     5     5     5     3     5     5     5     5     5     5     5
##  5 M016…     5     5     5     5    NA     4     5     5     5     5     5     5
##  6 M009…     4     4     4     4     4     3     4     4     4     4     4     5
##  7 M013…     5     4     3    NA     2     3     5     4     5     4     4     5
##  8 M012…     5     5     4     5     4     4     5     5     4     4     4     5
##  9 M014…     5    NA     3     5     4     2     5     5     5     4     4     5
## 10 M012…     5     4     4     5     3     4     5     4     3     4     4     5
## # ℹ 14 more rows
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "Denmark", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9583333
## 2           eng_2          0.7916667
## 3           eng_3          0.8333333
## 4           eng_4          0.7083333
## 5           eng_5          0.7500000
## 6           lea_1          0.8333333
## 7           lea_2          0.8750000
## 8           lea_3          0.8695652
## 9           lea_4          0.8695652
# Calculate factor favorability score for engagement & leadership 

## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations

## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(
    factor_favorable_score,
    data.frame(
      factor_abbreviation = factor_abbr,
      factor_favorable_score = avg_favorable_score,
      country = "Denmark"  # Add the country label
    )
  )
}



print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score country
## 1                 eng              0.8083333 Denmark
## 2                 lea              0.8618659 Denmark
favorability_score$Country <- "Denmark" 

#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score Country
## 10             eng          0.8083333 Denmark
## 1            eng_1          0.9583333 Denmark
## 2            eng_2          0.7916667 Denmark
## 3            eng_3          0.8333333 Denmark
## 4            eng_4          0.7083333 Denmark
## 5            eng_5          0.7500000 Denmark
## 11             lea          0.8618659 Denmark
## 6            lea_1          0.8333333 Denmark
## 7            lea_2          0.8750000 Denmark
## 8            lea_3          0.8695652 Denmark
## 9            lea_4          0.8695652 Denmark
#let's only keep engagement scores

engagement_favorable_scores <- combined_favorability_df[1:6, ] 
print(engagement_favorable_scores)
##    question_number favorability_score Country
## 10             eng          0.8083333 Denmark
## 1            eng_1          0.9583333 Denmark
## 2            eng_2          0.7916667 Denmark
## 3            eng_3          0.8333333 Denmark
## 4            eng_4          0.7083333 Denmark
## 5            eng_5          0.7500000 Denmark
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category


library(scales)

# Create a ggplot that shows engagement factor favorable scores for Danish employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +  
  labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for Danish Employees") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) + 
  scale_fill_manual(values = highlight_colors) +  
  scale_y_continuous(labels = percent_format(accuracy = 1))  

- e. France’s N = 136

df_2[df_2$country == "France" & !is.na(df_2$country), ]
## # A tibble: 136 × 30
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M014…     5     5     4     5     3     1     5     5     5     5     5     5
##  2 M006…     4     4     4     4     4     4     4     4     4     3     3     4
##  3 M004…     4     4     2     4     3     4     4     4     4     3     3     4
##  4 M011…     4     5     4     4     3     4     5     4     4     4     4     5
##  5 M002…     5     5     4     5     3     3     5     5     5     5     5     5
##  6 M009…     4     3     3     4     3     4     5     5     3     3     4     5
##  7 M024…     5     5     5     5     5     5     5     5     5     5     5     5
##  8 M005…     4     4     4     4     3     3     5     5     5     4     5     4
##  9 M018…     4     5     3     5     5     2     5     5     5     5     5     5
## 10 M006…     4     3     4     4     2     1     4     4     1     4     5     5
## # ℹ 126 more rows
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "France", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9338235
## 2           eng_2          0.8602941
## 3           eng_3          0.8750000
## 4           eng_4          0.7279412
## 5           eng_5          0.7720588
## 6           lea_1          0.9264706
## 7           lea_2          0.9044118
## 8           lea_3          0.9411765
## 9           lea_4          0.9485294
# Calculate factor favorability score for engagement & leadership 

## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations

## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(
    factor_favorable_score,
    data.frame(
      factor_abbreviation = factor_abbr,
      factor_favorable_score = avg_favorable_score,
      country = "France"  # Add the country label
    )
  )
}



print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score country
## 1                 eng              0.8338235  France
## 2                 lea              0.9301471  France
favorability_score$Country <- "France" 

#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score Country
## 10             eng          0.8338235  France
## 1            eng_1          0.9338235  France
## 2            eng_2          0.8602941  France
## 3            eng_3          0.8750000  France
## 4            eng_4          0.7279412  France
## 5            eng_5          0.7720588  France
## 11             lea          0.9301471  France
## 6            lea_1          0.9264706  France
## 7            lea_2          0.9044118  France
## 8            lea_3          0.9411765  France
## 9            lea_4          0.9485294  France
#let's only keep engagement scores

engagement_favorable_scores <- combined_favorability_df[1:6, ] 
print(engagement_favorable_scores)
##    question_number favorability_score Country
## 10             eng          0.8338235  France
## 1            eng_1          0.9338235  France
## 2            eng_2          0.8602941  France
## 3            eng_3          0.8750000  France
## 4            eng_4          0.7279412  France
## 5            eng_5          0.7720588  France
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category


library(scales)

# Create a ggplot that shows engagement factor favorable scores for French employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +  
  labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for French Employees") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) + 
  scale_fill_manual(values = highlight_colors) +  
  scale_y_continuous(labels = percent_format(accuracy = 1))  

- f. Germany’s N = 48

df_2[df_2$country == "Germany" & !is.na(df_2$country), ]
## # A tibble: 48 × 30
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M001…     5     5     4     3     3     4     5     5     4     2     3     5
##  2 M019…     4     4     4     5     5     5     5     5     4     3     5     4
##  3 M012…     4     4     3     3     3     3     3     4     4     3     4     3
##  4 M019…     3     4     4     4     2     4     4     4     3     3     3     4
##  5 M003…     4     4     5     4     4     4     5     4     5     4     4     5
##  6 M005…     5     5     3     4     2     2     3     4     4     2     3     4
##  7 M009…     4     4     4     5     3     3     4     5     4     5     4     5
##  8 M008…     5     4     3     4     3     3     5     5     4     2     4     5
##  9 M017…     5     5     5     5     3     5     5     5     5     4     4     5
## 10 M016…     5     3     3     3     4     2     4     3     3     2     3     5
## # ℹ 38 more rows
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "Germany", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.8333333
## 2           eng_2          0.8333333
## 3           eng_3          0.7083333
## 4           eng_4          0.5106383
## 5           eng_5          0.5957447
## 6           lea_1          0.8750000
## 7           lea_2          0.7916667
## 8           lea_3          0.9375000
## 9           lea_4          0.8541667
# Calculate factor favorability score for engagement & leadership 

## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations

## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(
    factor_favorable_score,
    data.frame(
      factor_abbreviation = factor_abbr,
      factor_favorable_score = avg_favorable_score,
      country = "Germany"  # Add the country label
    )
  )
}


print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score country
## 1                 eng              0.6962766 Germany
## 2                 lea              0.8645833 Germany
favorability_score$Country <- "Germany" 

#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score Country
## 10             eng          0.6962766 Germany
## 1            eng_1          0.8333333 Germany
## 2            eng_2          0.8333333 Germany
## 3            eng_3          0.7083333 Germany
## 4            eng_4          0.5106383 Germany
## 5            eng_5          0.5957447 Germany
## 11             lea          0.8645833 Germany
## 6            lea_1          0.8750000 Germany
## 7            lea_2          0.7916667 Germany
## 8            lea_3          0.9375000 Germany
## 9            lea_4          0.8541667 Germany
#let's only keep engagement scores

engagement_favorable_scores <- combined_favorability_df[1:6, ] 
print(engagement_favorable_scores)
##    question_number favorability_score Country
## 10             eng          0.6962766 Germany
## 1            eng_1          0.8333333 Germany
## 2            eng_2          0.8333333 Germany
## 3            eng_3          0.7083333 Germany
## 4            eng_4          0.5106383 Germany
## 5            eng_5          0.5957447 Germany
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category


library(scales)

# Create a ggplot that shows engagement factor favorable scores for German employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +  
  labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for German Employees") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) + 
  scale_fill_manual(values = highlight_colors) +  
  scale_y_continuous(labels = percent_format(accuracy = 1))  

- g. India's N = 288
df_2[df_2$country == "India" & !is.na(df_2$country), ]
## # A tibble: 288 × 30
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M014…     5     5     4     5     5     5     5     5     4     3     3     5
##  2 M020…     5     4     3     5     3     4     5     5     3     4     4     5
##  3 M002…     5     5     5     5     5     5     5     5     5     4     4     5
##  4 M024…     5     5     4     3     5     5     5     5     3     2     3     5
##  5 M017…     5     5     5     5     5     5     5     5     5     5     5     5
##  6 M017…     3     4     3     4     3     5     5     5     5     4     5     2
##  7 M016…     5     5     4     5     5     5     5     5     5     3     3     5
##  8 M006…     3     3     3     3     4     4     5     5     3     5     5     5
##  9 M026…     4     4     4     4     4     4     5     5     5     5     4     4
## 10 M023…     5     5     4     5     5     5     5     5     5     5     5     5
## # ℹ 278 more rows
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "India", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9442509
## 2           eng_2          0.9166667
## 3           eng_3          0.8767606
## 4           eng_4          0.7152778
## 5           eng_5          0.7673611
## 6           lea_1          0.8982456
## 7           lea_2          0.9125874
## 8           lea_3          0.9581882
## 9           lea_4          0.9542254
# Calculate factor favorability score for engagement & leadership 

## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations

## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(
    factor_favorable_score,
    data.frame(
      factor_abbreviation = factor_abbr,
      factor_favorable_score = avg_favorable_score,
      country = "India"  # Add the country label
    )
  )
}


print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score country
## 1                 eng              0.8440634   India
## 2                 lea              0.9308116   India
favorability_score$Country <- "India" 

#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score Country
## 10             eng          0.8440634   India
## 1            eng_1          0.9442509   India
## 2            eng_2          0.9166667   India
## 3            eng_3          0.8767606   India
## 4            eng_4          0.7152778   India
## 5            eng_5          0.7673611   India
## 11             lea          0.9308116   India
## 6            lea_1          0.8982456   India
## 7            lea_2          0.9125874   India
## 8            lea_3          0.9581882   India
## 9            lea_4          0.9542254   India
#let's only keep engagement scores

engagement_favorable_scores <- combined_favorability_df[1:6, ] 
print(engagement_favorable_scores)
##    question_number favorability_score Country
## 10             eng          0.8440634   India
## 1            eng_1          0.9442509   India
## 2            eng_2          0.9166667   India
## 3            eng_3          0.8767606   India
## 4            eng_4          0.7152778   India
## 5            eng_5          0.7673611   India
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category


library(scales)

# Create a ggplot that shows engagement factor favorable scores for Indian employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +  
  labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for Indian Employees") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) + 
  scale_fill_manual(values = highlight_colors) +  
  scale_y_continuous(labels = percent_format(accuracy = 1))  

- h. United Kingdom's N = 359
df_2[df_2$country == "United Kingdom" & !is.na(df_2$country), ]
## # A tibble: 359 × 30
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M003…     5     5     4     5     4     5     5     5     5     4     3     5
##  2 M014…     4     4     2     3     2     3     4     4     3     2     3     5
##  3 M020…     5     5     5     5     4     5     5     5     5     5     5     5
##  4 M023…     5     5     5     4     5     5     5     4     5     3     3     5
##  5 M019…     5     5     5     5     5     5     5     5     5     5     5     5
##  6 M010…     4     4     4     4     3     4     5     5     4     4     4     4
##  7 M000…     4     4     4     2     3     4     4     4     3     3     4     3
##  8 M013…     5     4     2     4    NA     4     5     5     4     4     4     5
##  9 M018…     3     3     2     2     1     3     4     3     3     2     3     3
## 10 M016…     5     5     4     5     3     5     5     5     5     4     5     5
## # ℹ 349 more rows
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "United Kingdom", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9385475
## 2           eng_2          0.8770950
## 3           eng_3          0.8328691
## 4           eng_4          0.6796657
## 5           eng_5          0.7214485
## 6           lea_1          0.8743017
## 7           lea_2          0.8659218
## 8           lea_3          0.9220056
## 9           lea_4          0.9359331
# Calculate factor favorability score for engagement & leadership 

## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations

## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(
    factor_favorable_score,
    data.frame(
      factor_abbreviation = factor_abbr,
      factor_favorable_score = avg_favorable_score,
      country = "United Kingdom"  # Add the country label
    )
  )
}


print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score        country
## 1                 eng              0.8099251 United Kingdom
## 2                 lea              0.8995405 United Kingdom
favorability_score$Country <- "United Kingdom" 

#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score        Country
## 10             eng          0.8099251 United Kingdom
## 1            eng_1          0.9385475 United Kingdom
## 2            eng_2          0.8770950 United Kingdom
## 3            eng_3          0.8328691 United Kingdom
## 4            eng_4          0.6796657 United Kingdom
## 5            eng_5          0.7214485 United Kingdom
## 11             lea          0.8995405 United Kingdom
## 6            lea_1          0.8743017 United Kingdom
## 7            lea_2          0.8659218 United Kingdom
## 8            lea_3          0.9220056 United Kingdom
## 9            lea_4          0.9359331 United Kingdom
#let's only keep engagement scores

engagement_favorable_scores <- combined_favorability_df[1:6, ] 
print(engagement_favorable_scores)
##    question_number favorability_score        Country
## 10             eng          0.8099251 United Kingdom
## 1            eng_1          0.9385475 United Kingdom
## 2            eng_2          0.8770950 United Kingdom
## 3            eng_3          0.8328691 United Kingdom
## 4            eng_4          0.6796657 United Kingdom
## 5            eng_5          0.7214485 United Kingdom
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category


library(scales)

# Create a ggplot that shows engagement factor favorable scores for U.S. employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +  
  labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for U.K. Employees") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) + 
  scale_fill_manual(values = highlight_colors) +  
  scale_y_continuous(labels = percent_format(accuracy = 1))  

- i. United States' N = 1551
df_2[df_2$country == "United States" & !is.na(df_2$country), ]
## # A tibble: 1,551 × 30
##    eeid  ali_1 ali_2 ali_3 col_1 col_2 col_3 eng_1 eng_2 eng_3 eng_4 eng_5 inc_1
##    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 M008…     5     5     5     5     5     5     5     5     5     5     5     5
##  2 M001…     5     5     4     5     5     5     5     5     5     4     3     5
##  3 M017…     4     4     3     4     4     4     4     4     3     4     4     4
##  4 M012…     4     3     4     3     3     3     4     3     4     3     3     4
##  5 M025…     5     5     4     4     5     5     5     4     5     4     5     5
##  6 M010…     5     5     5     5     4     5     5     5     4     5     3     5
##  7 M017…    NA    NA    NA    NA    NA    NA     4    NA    NA    NA    NA    NA
##  8 M018…     4     4     4     2     5     5     4     1     3     4     3     4
##  9 M013…     4     5     4     5     4     5     5     5     5     4     5     5
## 10 M004…     4     5     5     4     5     5     5     3     4     4     4     4
## # ℹ 1,541 more rows
## # ℹ 17 more variables: inc_2 <dbl>, inc_3 <dbl>, inc_4 <dbl>, inc_5 <dbl>,
## #   lea_1 <dbl>, lea_2 <dbl>, lea_3 <dbl>, lea_4 <dbl>, age <chr>,
## #   hiredate <date>, race <chr>, gender <chr>, manager_status <chr>,
## #   country <chr>, hireyear <chr>, tenure_group <chr>, engagement_factor <dbl>
# Calculate favorability_score for columns 8 to 12 (engagement questions) by using function_favorability I created.
favorability_score <- function_favorability(df_2[df_2$country == "United States", c(8:12, 18:21)])
colnames(favorability_score)[1] <- "question_number"
print(favorability_score)
##   question_number favorability_score
## 1           eng_1          0.9153200
## 2           eng_2          0.8312864
## 3           eng_3          0.7692806
## 4           eng_4          0.6735751
## 5           eng_5          0.7040155
## 6           lea_1          0.8000000
## 7           lea_2          0.8273616
## 8           lea_3          0.8654971
## 9           lea_4          0.8415584
# Calculate factor favorability score for engagement & leadership 

## Extract factor abbreviations first
factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number)) #substitute "-.*" to " " then only save unique abbreviations

## Create an empty data frame to save the calculated factor_favorable_score for each factor
factor_favorable_score <- data.frame(factor_abbreviation = character(),
                                     factor_favorable_score = numeric(),
                                     stringsAsFactors = FALSE)

## Calculate the average favorable score for each factor by using this function:
for (factor_abbr in factor_abbreviations) {
  # Subset favorable scores for questions belonging to the factor by using 'factor_abbreviations' vector 
  factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    #use grep to find 'factor_abbre_" in favorability_score df
  
  # Calculate the average favorable score for the factor
  avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)  # Add na.rm = TRUE to handle missing values
  
  # Add each factor favorable score to the data frame
  factor_favorable_score <- rbind(
    factor_favorable_score,
    data.frame(
      factor_abbreviation = factor_abbr,
      factor_favorable_score = avg_favorable_score,
      country = "United States"  # Add the country label
    )
  )
}

print(factor_favorable_score)
##   factor_abbreviation factor_favorable_score       country
## 1                 eng              0.7786955 United States
## 2                 lea              0.8336043 United States
favorability_score$Country <- "United States" 

#let's combine these two dfs
colnames(factor_favorable_score) <- c("question_number", "favorability_score", "Country")
combined_favorability_df <- rbind(favorability_score, factor_favorable_score)
combined_favorability_df <- combined_favorability_df[order(combined_favorability_df$question_number), ]
print(combined_favorability_df)
##    question_number favorability_score       Country
## 10             eng          0.7786955 United States
## 1            eng_1          0.9153200 United States
## 2            eng_2          0.8312864 United States
## 3            eng_3          0.7692806 United States
## 4            eng_4          0.6735751 United States
## 5            eng_5          0.7040155 United States
## 11             lea          0.8336043 United States
## 6            lea_1          0.8000000 United States
## 7            lea_2          0.8273616 United States
## 8            lea_3          0.8654971 United States
## 9            lea_4          0.8415584 United States
#let's only keep engagement scores

engagement_favorable_scores <- combined_favorability_df[1:6, ] 
print(engagement_favorable_scores)
##    question_number favorability_score       Country
## 10             eng          0.7786955 United States
## 1            eng_1          0.9153200 United States
## 2            eng_2          0.8312864 United States
## 3            eng_3          0.7692806 United States
## 4            eng_4          0.6735751 United States
## 5            eng_5          0.7040155 United States
# Distinguish the average row from other item rows
engagement_favorable_scores$highlight <- ifelse(engagement_favorable_scores$question_number == engagement_favorable_scores $question_number[1], "first", "other")

# Create a color palette to specify the average factor favorable score from others
highlight_colors <- c("first" = "#5E81AC",  # blue for the 'first' category
                      "other" = "#AEBAC2")  # Grey for the 'other' category


library(scales)

# Create a ggplot that shows engagement factor favorable scores for U.S. employees
ggplot(engagement_favorable_scores, aes(x = question_number, y = favorability_score, fill = highlight)) +
  geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +  
  labs(x = "Factor Items", y = "Favorability Score (%)", title = "Engagement Factor Favorable Scores for U.S. Employees") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 12),  
    plot.title = element_text(hjust = 0.5, size = 16), 
    axis.title.x = element_text(size = 14), 
    axis.title.y = element_text(size = 14) 
  ) +
  geom_text(aes(label = paste0(round(favorability_score * 100), "%")), vjust = -0.5, size = 4) + 
  scale_fill_manual(values = highlight_colors) +  
  scale_y_continuous(labels = percent_format(accuracy = 1))  

- j. create a visual that compares factor favorable scores for engagement & leadership across countries

#Create an empty data frame to save the consolidated favorable_scores
consolidated_favorable_scores <- data.frame(
  factor_abbreviation = character(),
  factor_favorable_score = numeric(),
  country = character(),
  stringsAsFactors = FALSE
)

#list of countries
countries <- c("Australia", "Canada", "China", "Denmark", "France", "Germany", "India", "United Kingdom", "United States")  

#Loop through each country and calculate factor favorable score
for (country in countries) {
  #Filter data for the specific country
  country_df <- df_2[df_2$country == country & !is.na(df_2$country), ]
  
  #Calculate favorability_score using a custom function
  favorability_score <- function_favorability(country_df[, c(8:12, 18:21)])
  colnames(favorability_score)[1] <- "question_number"
  
  factor_abbreviations <- unique(sub("_.*", "", favorability_score$question_number))
  
  #Create a data frame for this country's factor_favorable_score (so a dataframe within a loop)
  factor_favorable_score <- data.frame(
    factor_abbreviation = character(),
    factor_favorable_score = numeric(),
    country = character(),
    stringsAsFactors = FALSE
  )
  
  #Calculate the average favorable score for each factor
  for (factor_abbr in factor_abbreviations) {
    #Subset favorable scores for the factor
    factor_scores <- favorability_score[grep(paste0("^", factor_abbr, "_"), favorability_score$question_number), ]
    
    #Calculate the average favorable score
    avg_favorable_score <- mean(factor_scores$favorability_score, na.rm = TRUE)
    
    #Append the calculated score to the country's data frame
    factor_favorable_score <- rbind(
      factor_favorable_score,
      data.frame(
        factor_abbreviation = factor_abbr,
        factor_favorable_score = avg_favorable_score,
        country = country  # Add the country label
      )
    )
  }
  
  #Consolidate this country's data frame into the main consolidated data frame
  consolidated_favorable_scores <- rbind(
    consolidated_favorable_scores,
    factor_favorable_score
  )
}

print(consolidated_favorable_scores)
##    factor_abbreviation factor_favorable_score        country
## 1                  eng              0.8277228      Australia
## 2                  lea              0.9257426      Australia
## 3                  eng              0.8380952         Canada
## 4                  lea              0.8625932         Canada
## 5                  eng              0.8066667          China
## 6                  lea              0.8500000          China
## 7                  eng              0.8083333        Denmark
## 8                  lea              0.8618659        Denmark
## 9                  eng              0.8338235         France
## 10                 lea              0.9301471         France
## 11                 eng              0.6962766        Germany
## 12                 lea              0.8645833        Germany
## 13                 eng              0.8440634          India
## 14                 lea              0.9308116          India
## 15                 eng              0.8099251 United Kingdom
## 16                 lea              0.8995405 United Kingdom
## 17                 eng              0.7786955  United States
## 18                 lea              0.8336043  United States
#Sort the data frame by factor_favorable_score 
consolidated_favorable_scores <- consolidated_favorable_scores %>%
  arrange(desc(factor_favorable_score))

#Reorder the factor levels based on the sorted data frame
consolidated_favorable_scores$factor_abbreviation <- factor(
  consolidated_favorable_scores$factor_abbreviation,
  levels = unique(consolidated_favorable_scores$factor_abbreviation)  
)
# Load necessary libraries
library(ggplot2)
library(dplyr)
library(scales)

# Filter the data into two subsets: one for 'lea' (leadership factor) and one for 'eng' (engagement factor)
leadership_scores <- consolidated_favorable_scores %>%
  filter(factor_abbreviation == "lea")

engagement_scores <- consolidated_favorable_scores %>%
  filter(factor_abbreviation == "eng")

# Reorder the factor levels based on the sorted data frame
leadership_scores$country <- factor(
  leadership_scores$country,
  levels = unique(leadership_scores$country)
)

engagement_scores$country <- factor(
  engagement_scores$country,
  levels = unique(engagement_scores$country)
)

# Create ggplot for leadership scores
ggplot(leadership_scores, aes(x = country, y = factor_favorable_score * 100, fill = country)) +
  geom_bar(stat = "identity", position = "dodge") +  # Bars side-by-side
  geom_text(
    aes(label = paste0(round(factor_favorable_score * 100, 1), "%")),
    position = position_dodge(width = 0.9),
    vjust = -0.5  # Position the text above the bars
  ) +
  labs(
    x = "Country",
    y = "Leadership Favorability Score (%)",
    title = "Leadership Factor Favorable Scores by Country (%)"
  ) +
  scale_y_continuous(labels = percent_format(scale = 1)) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5)
  )

# Create ggplot for engagement scores
ggplot(engagement_scores, aes(x = country, y = factor_favorable_score * 100, fill = country)) +
  geom_bar(stat = "identity", position = "dodge") +  # Bars side-by-side
  geom_text(
    aes(label = paste0(round(factor_favorable_score * 100, 1), "%")),
    position = position_dodge(width = 0.9),
    vjust = -0.5  # Position the text above the bars
  ) +
  labs(
    x = "Country",
    y = "Engagement Favorability Score (%)",
    title = "Engagement Factor Favorable Scores by Country (%)"
  ) +
  scale_y_continuous(labels = percent_format(scale = 1)) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5)
  )